# The accuracy of Python naive Bayes is too low

2022-02-02 11:04:16

# -*- coding: utf-8 -*-
"""Categorical naive Bayes classifier evaluated on a random split of iris.csv.

Created on Sat Dec 11 12:19:57
@author: peng

Data layout used throughout this module: arrays are oriented
``(n_features + 1, n_samples)`` -- one column per sample, one row per
feature, with the class label in the LAST row.
"""
from collections import Counter

import numpy as np
import pandas as pd

# Same value as the original literal; written raw so that ``\p`` and ``\i``
# are not parsed as (invalid) escape sequences.
DEFAULT_CSV_PATH = r'E:\pythoncodes\pandas\iris.csv'


class Bayes(object):
    """Naive Bayes over exact-match (categorical) feature values."""

    # Splits remembered by data_split() so getTrainData() needs no globals.
    first_list = None
    second_list = None

    def data_split(self, path=DEFAULT_CSV_PATH, test_ratio=0.2):
        """Randomly divide the data set into a training and a test split.

        NOTE(review): assumes the CSV has one sample per row with the class
        label in the last column (the usual iris.csv layout) -- confirm
        against the actual file.

        Args:
            path: CSV file to load.
            test_ratio: fraction of the samples put into the test split.

        Returns:
            ``(first_list, second_list)``: training split and test split,
            both shaped ``(n_columns_in_csv, n_samples_in_split)``.
        """
        if not 0.0 <= test_ratio <= 1.0:
            raise ValueError("test_ratio must be between 0 and 1")

        frame = pd.read_csv(path)
        # Transpose so that each COLUMN is one sample; every other method
        # (and the driver below) indexes the data that way.
        data = np.array(frame).T
        n_samples = data.shape[1]
        n_test = int(n_samples * test_ratio)

        # Shuffle sample indices, then cut once: no sample is dropped and
        # the two splits are disjoint.
        order = np.random.permutation(n_samples)
        second_list = data[:, order[:n_test]]
        first_list = data[:, order[n_test:]]

        self.first_list = first_list
        self.second_list = second_list
        return first_list, second_list

    def getTrainData(self, first_list=None):
        """Separate a split into its feature rows and its label row.

        Args:
            first_list: split shaped ``(n_features + 1, n_samples)``.
                Defaults to the training split stored by ``data_split``.

        Returns:
            ``(trainData, labels)``: all rows but the last, and the last row.

        Raises:
            ValueError: if no split is available or it has fewer than 2 rows.
        """
        if first_list is None:
            first_list = self.first_list
        if first_list is None:
            raise ValueError(
                "no training split available: call data_split() first "
                "or pass first_list explicitly"
            )
        if len(first_list) < 2:
            raise ValueError(
                "split needs at least one feature row and one label row"
            )
        trainData = first_list[:-1]   # training data x1, x2, ...
        labels = first_list[-1]       # class y of every training sample
        return trainData, labels

    def classify(self, trainData, labels, features, aim, alpha=1.0):
        """Predict the class of one sample.

        For every class ``y`` the score is
        ``P(y) * prod_j P(x_j = features[j] | y)`` where the conditional is
        estimated by exact-match counting with additive smoothing::

            (count(x_j = v, y) + alpha) / (count(y) + alpha * K_j)

        ``K_j`` is the number of distinct values of feature ``j`` in the
        training data. ``alpha=0`` gives the plain frequency estimate.

        Args:
            trainData: feature rows, indexable as ``trainData[j]`` giving
                feature ``j`` of every training sample.
            labels: class of every training sample.
            features: feature values of the sample to classify.
            aim: the sample's true class; only printed next to the
                prediction.
            alpha: additive (Laplace) smoothing strength, ``>= 0``.

        Returns:
            The class with the highest score; on ties, the class that
            appears first in ``labels``.

        Raises:
            ValueError: if ``labels`` is empty or ``alpha`` is negative.
        """
        labels = list(labels)
        if not labels:
            raise ValueError("classify() needs at least one training sample")
        if alpha < 0:
            raise ValueError("alpha must be non-negative")

        n_samples = len(labels)
        # Counter keeps first-appearance order, which makes ties
        # deterministic.
        class_counts = Counter(labels)
        # Number of distinct values per feature, hoisted out of the class
        # loop.
        n_values = [len(set(trainData[j])) for j in range(len(features))]

        scores = {}
        for y, y_count in class_counts.items():
            score = y_count / float(n_samples)          # prior P(y)
            for j, value in enumerate(features):
                # Count per feature INDEX, so that equal values in
                # different features can never be mixed up.
                xy_count = sum(
                    1
                    for x, label in zip(trainData[j], labels)
                    if label == y and x == value
                )
                score *= (xy_count + alpha) / (y_count + alpha * n_values[j])
            scores[y] = score

        testResult = max(scores, key=scores.get)
        print(testResult, aim)
        return testResult


def main():
    """Split the data, classify every test sample and report the accuracy."""
    NB = Bayes()
    first_list, second_list = NB.data_split()
    trainData, labels = NB.getTrainData()

    rest_data = second_list[:-1]      # feature rows of the test split
    truth = second_list[-1]           # true class of every test sample
    n_test = rest_data.shape[1]       # samples are columns, not rows

    TR_total = 0                      # correctly classified samples
    for n in range(n_test):
        result = NB.classify(trainData, labels, rest_data[:, n], truth[n])
        if result == truth[n]:
            TR_total += 1
    TN_total = n_test - TR_total      # misclassified samples

    if n_test:
        print("correct: %d  wrong: %d  accuracy: %.3f"
              % (TR_total, TN_total, TR_total / float(n_test)))


if __name__ == '__main__':
    main()