Python Forum
How to cluster dataset - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Data Science (https://python-forum.io/forum-44.html)
+--- Thread: How to cluster dataset (/thread-22469.html)



How to cluster dataset - neha_garg - Nov-14-2019

Dear Friends,

I am trying to cluster a dataset. My code plots the centroids, but I have not found a way to plot the clusters themselves or to store the clustering results in CSV format.
Please help!
import tweepy
import numpy as np
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import json
import re
import pandas as pd
import csv
import re #regular expressionfrom textblob import TextBlob
import string
import preprocessor as p
import textblob
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
#import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# Twitter API credentials for the app (blank placeholders in this listing).
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Build the OAuth handler from the credentials, attach the access token,
# and create the tweepy API client from it.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# Column names of the CSV file.
COLS = [
    'id',
    'created_at',
    'source',
    'original_text',
    'clean_text',
    'sentiment',
    'polarity',
    'subjectivity',
    'lang',
    'favorite_count',
    'retweet_count',
    'original_author',
    'possibly_sensitive',
    'hashtags',
    'user_mentions',
    'place',
    'place_coord_boundaries',
]

# Load the previously collected tweets.
# NOTE(review): df1 is not used by the clustering below, which runs on the
# hard-coded demo hashtags -- confirm whether df1['hashtags'] should be
# clustered instead.
df1 = pd.read_csv('C:/Users/Neha Garg/AppData/Local/Programs/Python/data/chandrayan1.csv',
                  encoding='unicode_escape')

df = pd.DataFrame(data={"hashtags": ["chandrayan", "chandrayan2", "pulwama", "Indiafail",
                                     "Medicine", "Health", "ISRO"]})

# Keep the readable hashtag text so it can be written next to the cluster id.
hashtag_names = df['hashtags']

# KMeans needs numeric input, so replace each hashtag by its category code.
df['hashtags'] = df['hashtags'].astype('category').cat.codes

# Make a real copy of the frame: a plain `df_tr = df` only creates a second
# name for the same object, so adding columns to it would also alter `df`.
df_tr = df.copy()

# `n_jobs`, `precompute_distances` and `algorithm='auto'` are no longer
# accepted by current scikit-learn releases; omitting them keeps the defaults
# and works on both old and new versions.
# NOTE(review): 7 clusters for 7 distinct samples puts every sample in its own
# cluster -- lower n_clusters for a meaningful grouping.
kmeans = KMeans(n_clusters=7, init='k-means++', n_init=10, max_iter=300,
                tol=0.0001, random_state=None, verbose=0, copy_x=True).fit(df)
y = kmeans.predict(df)

labels = kmeans.labels_

# Glue the cluster assignment (and the readable hashtag) back onto the copy.
df_tr['clusters'] = labels
df_tr['hashtag_name'] = hashtag_names

# Store the clustering result in CSV format (written to the working directory).
df_tr.to_csv('clustered_hashtags.csv', index=False)

centroids = kmeans.cluster_centers_
print(centroids)

# The data has a single feature, so draw everything along one horizontal line:
# samples coloured by their cluster, centroids as red crosses on top.
plt.scatter(df['hashtags'], np.zeros(len(df)), c=y, s=50, alpha=0.5, cmap='viridis')
plt.scatter(centroids[:, 0], np.zeros(len(centroids)), c='red', s=50, alpha=0.5, marker='x')
plt.xlabel('hashtag code')
plt.yticks([])
plt.show()