Python Forum

Full Version: Machine Learning errors
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi everyone, I'm currently in the process of trying to use machine learning to sort through emails but whenever I run the code, I keep getting errors, one such error is saying 'list' object has no attribute 'most_common'. I have tried to figure out why it is doing this but can not figure out why nor how to fix it. Below is my code, if anyone can help I'd be really thankful. The error appears at the dictionary = dictionary.most_common side. I dont know if there are any more errors as the program will not go past this part

import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC

def make_dictionary(train_dir):
    emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]
    all_words = []
    for mail in emails:
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:
                    words = line.split()
                    all_words += words
    dictionary = Counter(all_words)
    return dictionary
train_dir = 'train-mails'
dictionary = make_dictionary(train_dir)
list_to_remove = dictionary.keys()
for item in list_to_remove:
    if item.isalpha() == False:
        del dictionary[item]
    elif len(item) == 1:
        del dictionary[item]
    dictionary = dictionary.most_common(3000)

def extract_features(mail_dir):
    files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files),3000))
    docID = 0;
    for fil in files:
        with open(fil) as fi:
            for i,line in enumerate(fi):
                if i == 2:
                    words = line.split()
                for word in words:
                  wordID = 0
                  for i,d in enumerate(dictionary):
                    if d[0] == word:
                      wordID = i
                      features_matrix[docID,wordID] = words.count(word)
            docID = docID + 1
        return features_matrix

train_labels = np.zeros(702)
train_labels[351:701] = 1
train_matrix = extract_features(train_dir)


model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(train_matrix,train_labels)
model2.fit(train_matrix,train_labels)

test_dir = 'test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
print confusion_matrix(test_labels,result1)
print confusion_matrix(test_labels,result2)
please post full traceback in error tags