Python - Soy novat en python y no puedo correr un codigo en ANACONDA alguien podria ayudarme o explicarme?

   
Vista:

Soy novat en python y no puedo correr un codigo en ANACONDA alguien podria ayudarme o explicarme?

Publicado por kami (1 intervención) el 03/06/2017 05:57:02
El código lo obtuve de esta pagina:
http://goelhardik.github.io/2016/10/04/fishers-lda/

-descargue e instale python 2.7
-lo primero que me apareció es que faltaba la libreria pandas, buscando en internet encontré en el sitio oficial que la forma mas facil de obtenerlo para windows era descargando anaconda xq ya lo tenia

-En fin no puedo y no se como hacerle para correr este programa
Algun@ de ustedes podrá ayudarme??
Pues necesito un ejemplo de programa FISHER LDA y ando mas que LOST
muchas gracias a tod@s por su amable atención

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import sys, itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
 
class LDA():
    def __init__(self, data, num_dims = 1, convert_data = 0,
                 percentile = 50, threshold = 0, labelcol = -1, split_ratio = 0.9):
        '''
        Class constructor.
        --------------------------------
        data : the entire dataset
        num_dims : number of dimensions to project data into
        convert_data : flag to specify whether the data is to be converted to categorical
        percentile : if convert_data is True, then specify the percentile for conversion
        threshold : flag to indicate whether to do thresholding or gaussian modeling for classification
        labelcol : which column in the csv data contains the label
        split_ratio : split ratio for train-test split
        '''
        self.data = data
        self.num_dims = num_dims
        self.convert_data = convert_data
        self.percentile = percentile
        self.threshold = threshold
        self.labelcol = labelcol
        self.split_ratio = split_ratio
        if (self.convert_data):
            self.data = self.to_categorical(self.data, self.percentile)
 
    '''
    Function to convert data to categorical.
    To be used only for the boston dataset.
    '''
    def to_categorical(self, data, percentile):
        fraction = percentile / 100.0
 
        # partition data based on percentile of label column
        med = data.ix[:, self.labelcol].quantile(fraction)
        for i in range(data.shape[0]):
            if (data.ix[i, self.labelcol] >= med):
                data.ix[i, self.labelcol] = 1
            else:
                data.ix[i, self.labelcol] = 0
 
        return data
 
    '''
    Utility function to drop some column from the given pandas dataframe.
    '''
    def drop_col(self, data, col):
        return data.drop(data.columns[[col]], axis = 1)
 
    '''
    Main function to apply LDA
    '''
    def fit(self):
        # Function estimates the LDA parameters
        def estimate_params(data):
            # group data by label column
            grouped = data.groupby(self.data.ix[:,self.labelcol])
 
            # calculate means for each class
            means = {}
            for c in self.classes:
                means[c] = np.array(self.drop_col(self.classwise[c], self.labelcol).mean(axis = 0))
 
            # calculate the overall mean of all the data
            overall_mean = np.array(self.drop_col(data, self.labelcol).mean(axis = 0))
 
            # calculate between class covariance matrix
            # S_B = \sigma{N_i (m_i - m) (m_i - m).T}
            S_B = np.zeros((data.shape[1] - 1, data.shape[1] - 1))
            for c in means.keys():
                S_B += np.multiply(len(self.classwise[c]),
                                   np.outer((means[c] - overall_mean),
                                            (means[c] - overall_mean)))
 
            # calculate within class covariance matrix
            # S_W = \sigma{S_i}
            # S_i = \sigma{(x - m_i) (x - m_i).T}
            S_W = np.zeros(S_B.shape)
            for c in self.classes:
                tmp = np.subtract(self.drop_col(self.classwise[c], self.labelcol).T, np.expand_dims(means[c], axis=1))
                S_W = np.add(np.dot(tmp, tmp.T), S_W)
 
            # objective : find eigenvalue, eigenvector pairs for inv(S_W).S_B
            mat = np.dot(np.linalg.pinv(S_W), S_B)
            eigvals, eigvecs = np.linalg.eig(mat)
            eiglist = [(eigvals[i], eigvecs[:, i]) for i in range(len(eigvals))]
 
            # sort the eigvals in decreasing order
            eiglist = sorted(eiglist, key = lambda x : x[0], reverse = True)
 
            # take the first num_dims eigvectors
            w = np.array([eiglist[i][1] for i in range(self.num_dims)])
 
            self.w = w
            self.means = means
            return
 
 
        # perform train-test split
        traindata = []
        testdata = []
        # group data by label column
        grouped = data.groupby(self.data.ix[:,self.labelcol])
        self.classes = [c for c in grouped.groups.keys()]
        self.classwise = {}
        for c in self.classes:
            self.classwise[c] = grouped.get_group(c)
            rows = random.sample(self.classwise[c].index,
                                     int(self.classwise[c].shape[0] *
                                     self.split_ratio))
            traindata.append(self.classwise[c].ix[rows])
            testdata.append(self.classwise[c].drop(rows))
 
        traindata = pd.concat(traindata)
        testdata = pd.concat(testdata)
 
        # estimate the LDA parameters
        estimate_params(traindata)
        # perform classification on test set
        # if the method is threshold
        if (self.threshold):
            self.calculate_threshold()
            # append the training and test error rates for this iteration
            trainerror = self.calculate_score(traindata) / float(traindata.shape[0])
 
            testerror = self.calculate_score(testdata) / float(testdata.shape[0])
 
        # if the method is gaussian modeling
        else:
            self.gaussian_modeling()
            # append the training and test error rates for this iteration
            trainerror = self.calculate_score_gaussian(traindata) / float(traindata.shape[0])
            testerror = self.calculate_score_gaussian(testdata) / float(testdata.shape[0])
 
        return trainerror, testerror
 
    '''
    Function to calculate the classification threshold.
    Projects the means of the classes and takes their mean as the threshold.
    Also specifies whether values greater than the threshold fall into class 1 
    or class 2.
    '''
    def calculate_threshold(self):
        # project the means and take their mean
        tot = 0
        for c in self.means.keys():
            tot += np.dot(self.w, self.means[c])
        self.w0 = 0.5 * tot
 
        # for 2 classes case; mark if class 1 is >= w0 or < w0
        c1 = self.means.keys()[0]
        c2 = self.means.keys()[1]
        mu1 = np.dot(self.w, self.means[c1])
        if (mu1 >= self.w0):
            self.c1 = 'ge'
        else:
            self.c1 = 'l'
 
    '''
    Function to calculate the scores in thresholding method.
    Assigns predictions based on the calculated threshold.
    '''
    def calculate_score(self, data):
        inputs = self.drop_col(data, self.labelcol)
        # project the inputs
        proj = np.dot(self.w, inputs.T).T
        # assign the predicted class
        c1 = self.means.keys()[0]
        c2 = self.means.keys()[1]
        if (self.c1 == 'ge'):
            proj = [c1 if proj[i] >= self.w0 else c2 for i in range(len(proj))]
        else:
            proj = [c1 if proj[i] < self.w0 else c2 for i in range(len(proj))]
        # calculate the number of errors made
        errors = (proj != data.ix[:, self.labelcol])
        return sum(errors)
 
    '''
    Function to estimate gaussian models for each class.
    Estimates priors, means and covariances for each class.
    '''
    def gaussian_modeling(self):
        self.priors = {}
        self.gaussian_means = {}
        self.gaussian_cov = {}
 
        for c in self.means.keys():
            inputs = self.drop_col(self.classwise[c], self.labelcol)
            proj = np.dot(self.w, inputs.T).T
            self.priors[c] = inputs.shape[0] / float(self.data.shape[0])
            self.gaussian_means[c] = np.mean(proj, axis = 0)
            self.gaussian_cov[c] = np.cov(proj, rowvar=False)
 
    '''
    Utility function to return the probability density for a gaussian, given an 
    input point, gaussian mean and covariance.
    '''
    def pdf(self, point, mean, cov):
        cons = 1./((2*np.pi)**(len(point)/2.)*np.linalg.det(cov)**(-0.5))
        return cons*np.exp(-np.dot(np.dot((point-mean),np.linalg.inv(cov)),(point-mean).T)/2.)
 
    '''
    Function to calculate error rates based on gaussian modeling.
    '''
    def calculate_score_gaussian(self, data):
        classes = sorted(list(self.means.keys()))
        inputs = self.drop_col(data, self.labelcol)
        # project the inputs
        proj = np.dot(self.w, inputs.T).T
        # calculate the likelihoods for each class based on the gaussian models
        likelihoods = np.array([[self.priors[c] * self.pdf([x[ind] for ind in
                                                            range(len(x))], self.gaussian_means[c],
                               self.gaussian_cov[c]) for c in
                        classes] for x in proj])
        # assign prediction labels based on the highest probability
        labels = np.argmax(likelihoods, axis = 1)
        errors = np.sum(labels != data.ix[:, self.labelcol])
        return errors
 
    def plot_bivariate_gaussians(self):
        classes = list(self.means.keys())
        colors = cm.rainbow(np.linspace(0, 1, len(classes)))
        plotlabels = {classes[c] : colors[c] for c in range(len(classes))}
 
        fig = plt.figure()
        ax3D = fig.add_subplot(111, projection='3d')
        for c in self.means.keys():
            data = np.random.multivariate_normal(self.gaussian_means[c],
                                                 self.gaussian_cov[c], size=100)
            pdf = np.zeros(data.shape[0])
            cons = 1./((2*np.pi)**(data.shape[1]/2.)*np.linalg.det(self.gaussian_cov[c])**(-0.5))
            X, Y = np.meshgrid(data.T[0], data.T[1])
            def pdf(point):
                return cons*np.exp(-np.dot(np.dot((point-self.gaussian_means[c]),np.linalg.inv(self.gaussian_cov[c])),(point-self.gaussian_means[c]).T)/2.)
 
            zs = np.array([pdf(np.array(ponit)) for ponit in zip(np.ravel(X),
                                                                   np.ravel(Y))])
            Z = zs.reshape(X.shape)
            surf = ax3D.plot_surface(X, Y, Z, rstride=1, cstride=1,
                                       color=plotlabels[c], linewidth=0,
                                       antialiased=False)
        plt.show()
 
    def plot_proj_1D(self, data):
        classes = list(self.means.keys())
        colors = cm.rainbow(np.linspace(0, 1, len(classes)))
        plotlabels = {classes[c] : colors[c] for c in range(len(classes))}
 
        fig = plt.figure()
        for i, row in data.iterrows():
            proj = np.dot(self.w, row[:self.labelcol])
            plt.scatter(proj, np.random.normal(0,1,1)+0, color =
                        plotlabels[row[self.labelcol]])
        plt.show()
 
    def plot_proj_2D(self, data):
        classes = list(self.means.keys())
        colors = cm.rainbow(np.linspace(0, 1, len(classes)))
        plotlabels = {classes[c] : colors[c] for c in range(len(classes))}
 
        fig = plt.figure()
        for i, row in data.iterrows():
            proj = np.dot(self.w, row[:self.labelcol])
            plt.scatter(proj[0], proj[1], color =
                        plotlabels[row[self.labelcol]])
        plt.show()
 
if __name__ == '__main__':
    data = pd.read_csv(sys.argv[1])
    labelcol = int(sys.argv[2])
    lda = LDA(data, num_dims=2, convert_data=0, threshold=0, labelcol=labelcol)
    trainerror, testerror = lda.fit()
    print(trainerror)
    print(testerror)
    #print(verifyLDA(data, data, labelcol))
    lda.plot_proj_2D(data)
    lda.plot_bivariate_gaussians()

pandas
Valora esta pregunta
Me gusta: Está pregunta es útil y esta claraNo me gusta: Está pregunta no esta clara o no es útil
0
Responder