Python Forum
Data mining code help
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Data mining code help
#1
Dear,

for an assignment we need to data mine some data from an excel sheet provided. Also we got some code provided by our professor.
Me and my team aren't really skilled in Python so thats why I'm asking you this question.

We got 2 numerical columns and 5 catogorical columns. And the assignment is to get the:
Minimum
Maximum
Median
Mean
of the columns with numerical features

and the 'classes'of the rows with catogorical features.

Now we can get the anwsers by changing the row number in the code but we have to provide the anwsers together in one code.
So how can we get the anwers for all of the rows together nice and neat without copying the whole code 7times.


PS. The print mean function gives an error that we also can't fix.


# -*- coding: utf-8 -*-
"""
Created on Wed Sep 11 14:04:58 2019

@author: Group_40 
		
		
		
"""
#from Read_csvdata import ReadData
from InformationGain import FindInformationGain
from SplitSet import SplitSetOnNumFeature,SplitSetOnCatFeature,FindMedianThreshold
import numpy as np 
from math import log, e 
   
import statistics
import csv
from statistics import mean

###############################################################################

   
#################################################################################
def ReadData(file_name):
    # Argument:
    # 1-file_name is a string of the file name with local folder
        
    

  
    Dataset = [] 
    TARGETS = []
 
    

    headercheckdone = False # make True if data file has header.
    instancelength = 0

    with open(file_name) as csv_file:
    
        
        csv_reader = csv.reader(csv_file, delimiter=';')
        
        headercheckdone = False
        featuretypes = []
        featuretypecheck = False

        for row in csv_reader:
  
            if headercheckdone is False:
                headercheckdone = True                 
                if RepresentsInt(row[3]) is False and RepresentsFloat(row[3]) is False:
                    if row[3].find('Class') > -1 or row[3].find('target') > -1:
                        print('Data set has header')
                        continue
                   
                         
      
                # features are numerical, except the target
            datapoint = []
            
            if featuretypecheck == False:
                
                instancelength = len(row)
                for item in range(len(row)-1):
                                       
                    if RepresentsInt(row[item]) or RepresentsFloat(row[item]):
                        featuretypes.append('Numerical')
                    else:
                        featuretypes.append('Categorical')
                
                print('Feature types: ',featuretypes)
                featuretypecheck = True        
            else:
                if len(row) < instancelength:
                    continue
                
          # print('instance length: ', len(row))   
                #print(row)
            index = 0    
            insert = True
            for f in range(len(row)-1):
                #print('index',index,'feature',row[f])
                if str(row[f]).find('?') > -1:
                    insert = False
                if featuretypes[index] == 'Numerical':
                    try: 
                        featureval = float(row[f])
                        datapoint.append(featureval)
                    except ValueError:
                        insert = False
                        continue
                else:
                    datapoint.append(row[f])
                index+=1
    
            datapoint.append(row[3])

            if insert:
                if row[3] not in TARGETS:
                    TARGETS.append(row[3])

               

                Dataset.append(datapoint)
           

#this is what we added to the pofessors code   
    print('# Instances: ',len(Dataset))
    print('Target classes: ',TARGETS)
    print('Maximum is:' , max(TARGETS)) 
    print('Minimum is:' , min(TARGETS)) 
    print('Median is :', statistics.median(TARGETS))
#    print('Mean is:', statistics.mean(TARGET))
    print('Classes are:', TARGETS)
    
    # DATA_TABLE: keeps all instances in an array 
    # An instance is an array itself whose last item is the target and others are features/attributes
    
    # featuretypes: An array of strings that indicate if a feature is 'Numerical' or Categorical'
    return Dataset,featuretypes

################################################################################
    
def RepresentsInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def RepresentsFloat(s):
    try: 
        float(s)
        return True
    except ValueError:
        return False
#################################################################################
def printDataSet(Dataset):
    
    print('Data set array')
    insno = 1
    for ins in Dataset:
        print(insno,':',ins)
        insno+=1
     
    return
#################################################################################
def main():

    print('1BK40 Python Library code: Reading data set csv format')
    print('> Data set with numerical/categorical target')
    
    # if the csv file is in a directory, say 'datafiles', use the following
    #csvdirectory = 'datafiles/'
    
    # if csv file is in the same folder, use the following: 
    csvdirectory = ''
    
    # name of the csv file
    filename = 'AdventourData.csv'
    
    print('-----------------------------------------------')
    print('Data set: '+filename)
    
    Dataset,featuretypes = ReadData(csvdirectory+filename)
    #printDataSet(Dataset)
    

# To see how file reading works, you may uncomment the following. 
main() 
#Important: the main() when you import this file from another one. Otherwise, main function will be executed. 
Reply
#2
I would use Pandas library for this assignment.
Reply


Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020