Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
How to cluster dataset
#1
Dear Friends,

I am trying to cluster a dataset. My code plots the centroids, but I have not found a way to plot the clusters themselves or to store the clustering results in CSV format.
Please help!
import tweepy
import numpy as np
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import json
import re
import pandas as pd
import csv
import re  # regular expressions
import string
import preprocessor as p
import textblob
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
#import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# Twitter API credentials for the app (intentionally blank here).
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Hand the credentials to tweepy and build the API client.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# Column names of the CSV file, one per line for easier diffing.
COLS = [
    'id',
    'created_at',
    'source',
    'original_text',
    'clean_text',
    'sentiment',
    'polarity',
    'subjectivity',
    'lang',
    'favorite_count',
    'retweet_count',
    'original_author',
    'possibly_sensitive',
    'hashtags',
    'user_mentions',
    'place',
    'place_coord_boundaries',
]

# Tweet dataset loaded from disk. NOTE(review): nothing below reads df1; the
# clustering runs on the small hand-built frame `df` instead -- confirm intent.
df1 = pd.read_csv(
    'C:/Users/Neha Garg/AppData/Local/Programs/Python/data/chandrayan1.csv',
    encoding='unicode_escape',
)

# Single-feature frame: each hashtag is replaced by its integer category code
# so that KMeans receives numeric input.
df = pd.DataFrame(
    data={
        "hashtags": [
            "chandrayan", "chandrayan2", "pulwama", "Indiafail",
            "Medicine", "Health", "ISRO",
        ],
    }
)
df['hashtags'] = df['hashtags'].astype('category').cat.codes

# Make a real copy of the frame. A plain `df_tr = df` only aliases it, so the
# 'clusters' column added below would also appear in the feature frame `df`.
df_tr = df.copy()

# Only keyword arguments that current scikit-learn still accepts are passed:
# `n_jobs` and `precompute_distances` were removed from KMeans and
# `algorithm='auto'` is no longer a valid choice, so passing them raises
# TypeError / a parameter-validation error on recent releases.
# NOTE(review): with 7 distinct rows and n_clusters=7 every row ends up in its
# own cluster -- pick a smaller k for a meaningful grouping.
kmeans = KMeans(
    n_clusters=7,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    copy_x=True,
    random_state=None,
    verbose=0,
).fit(df)
y = kmeans.predict(df)

labels = kmeans.labels_

# Glue the cluster labels back onto the copy of the original data.
# To persist the assignments, df_tr can be written out with DataFrame.to_csv.
df_tr['clusters'] = labels

centroids = kmeans.cluster_centers_
print(centroids)

# The data has a single feature, so everything is drawn along the x axis with
# a constant y of 0: samples coloured by their cluster, centroids in red.
# (Previously only the centroids were drawn, plotted against themselves.)
plt.scatter(df['hashtags'], np.zeros(len(df)), c=y, s=50, alpha=0.5, cmap='viridis')
plt.scatter(centroids[:, 0], np.zeros(len(centroids)), c='red', s=50, alpha=0.5)
plt.show()
Reply


Messages In This Thread
How to cluster dataset - by neha_garg - Nov-14-2019, 07:38 AM

Possibly Related Threads…
Thread Author Replies Views Last Post
  updating cluster of elements based on the max value of distance alex80 0 1,575 Oct-02-2020, 11:11 AM
Last Post: alex80
  using silhouette score for each sample of an array with each cluster alex80 1 2,823 Sep-25-2020, 11:35 PM
Last Post: scidam

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020