How to cluster dataset

neha_garg · (This post was last modified: Nov-14-2019, 10:22 AM by Larz60+.)

Dear Friends,

I am trying to cluster a dataset. My code plots the centroids, but I couldn't find a way to plot the clusters themselves and store the clustering results in CSV format.
Please help!

import tweepy
import numpy as np
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import json
import re
import pandas as pd
import csv
import re #regular expressionfrom textblob import TextBlob
import string
import preprocessor as p
import textblob
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
#import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# Twitter API credentials for the app (blank placeholders in this listing).
consumer_key, consumer_secret = '', ''
access_key, access_secret = '', ''

# Build the OAuth handler from the consumer pair, attach the access token
# pair, then create the API client from the authenticated handler.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Column names of the CSV file, in output order.
COLS = [
    'id',
    'created_at',
    'source',
    'original_text',
    'clean_text',
    'sentiment',
    'polarity',
    'subjectivity',
    'lang',
    'favorite_count',
    'retweet_count',
    'original_author',
    'possibly_sensitive',
    'hashtags',
    'user_mentions',
    'place',
    'place_coord_boundaries',
]

# NOTE(review): df1 is loaded here but is not used by the clustering below.
df1 = pd.read_csv('C:/Users/Neha Garg/AppData/Local/Programs/Python/data/chandrayan1.csv',encoding ='unicode_escape')

# Small demo frame with a single 'hashtags' feature column.
df = pd.DataFrame(data={"hashtags": ["chandrayan", "chandrayan2", "pulwama", "Indiafail", "Medicine", "Health", "ISRO"]})
# KMeans needs numeric input: replace each hashtag string by its integer
# category code.
df['hashtags'] = df['hashtags'].astype('category').cat.codes

# Make a real copy of the frame. A plain `df_tr = df` only aliases it, so the
# 'clusters' column added below would also end up in the feature frame `df`.
df_tr = df.copy()

# Fit k-means on the encoded feature. The arguments `n_jobs`,
# `precompute_distances` and `algorithm='auto'` were dropped: they were all
# set to their defaults and are no longer accepted by current scikit-learn
# releases, so the fit is unchanged on versions that still knew them.
kmeans = KMeans(
    n_clusters=7,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=None,
    verbose=0,
    copy_x=True,
).fit(df)
# Cluster index predicted for every row of the feature frame.
y = kmeans.predict(df)

labels = kmeans.labels_

# Glue the cluster labels back onto the copy of the original data.
df_tr['clusters'] = labels

centroids = kmeans.cluster_centers_
print(centroids)

# The data has only one feature column, so (like the centroids) each point is
# drawn with that feature on both axes; the colour shows the assigned cluster.
# DataFrames need `.iloc` for positional indexing; `df[:, 0]` raises.
points = df.iloc[:, 0]
plt.scatter(points, points, c=y, s=50, alpha=0.5, cmap='viridis')
# Centroids are drawn in red on top of the clustered points.
plt.scatter(centroids[:, 0], centroids[:, 0], c='red', s=50, alpha=0.5)
plt.show()

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	updating cluster of elements based on the max value of distance	alex80	0	1,597	Oct-02-2020, 11:11 AM Last Post: alex80
	using silhouette score for each sample of an array with each cluster	alex80	1	2,862	Sep-25-2020, 11:35 PM Last Post: scidam

How to cluster dataset

User Panel Messages

Announcements