May-12-2018, 11:21 PM
Need help creating a function that cleans data and puts frequency in dictionary.
import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords import string #create a function and dictionary def clean_data(tokenizeFreq) token_frequency_dic = {} # load data article = open('sample_data.txt','r') text = article.read() file.close() # split into words tokens = word_tokenize(text) # convert to lower case tokens = [w.lower() for w in tokens] # remove punctuation from each word table = str.maketrans('', '', string.punctuation) stripped = [w.translate(table) for w in tokens] # remove remaining tokens that are not alphabetic words = [word for word in stripped if word.isalpha()] # filter out stop words and sort stop_words = set(stopwords.words('english')) words = [w for w in words if not w in stop_words] words.sort() # print frequency distribution req = nltk.FreqDist(words) for k,v in req.items(): print(str(k) + ': ' + str(v))can this be condense into a for loop...