hw1_Prob3_with_sklearn

hw1_Prob3_with_sklearn

Prob 3-1 w/. Sklearn

In [21]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt
import scipy
In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import linear_model

Import Data,建立training set and testing set

In [23]:
# x1:T, x2:V, x3:AP, x4:RH, x0=1
data3 = pd.read_excel("data.xlsx")
data3['x0'] = 1      #增加一排常數項當作x0
#data = data3.head()
data = data3
train_x    = np.matrix( data.drop('EP', 1)[0:400] )
train_y    = np.array( data.EP[0:400] )
test_x    = np.matrix( data.drop('EP', 1)[400:500] )
test_y    = np.array( data.EP[400:500] )
train_df = data[0:400] 
test_df = data[400:500]
D = 5    # dim (T,V,AP,RH) 維度是4, 加上x0維度是5
In [28]:
def Reg(degree):
    #---------build up model---------------------------
    reg = linear_model.LinearRegression()
    est = make_pipeline(PolynomialFeatures(degree), reg)
    est.fit(train_x, train_y)
    
    #---------find coeff---------------------------
    #print est.steps
    coef = est.steps[-1][1].coef_#.ravel()
    coef_pair = zip(np.abs(coef),range(coef.size))
    feature_names = est.steps[0][1].get_feature_names()
    #print feature_names
    
    #---------plot coeff---------------------------
    print 'plot:'
    plt.plot(range(coef.size),np.abs(coef),'b-')
    plt.xlabel('coef order')
    plt.ylabel('abs(w)')
    plt.title('Coef Weight')
    plt.show()
    #plt.close()
    
    #---------show coeff---------------------------    
    coef_pair.sort()
    rank = [ coef_pair[-i] for i in range(1,7) ]
    print 'coef=',rank
    print 'feature_names=', [ feature_names[pair[1]] for pair in rank ]
    print '*'*64

    #---------predict y from test_x---------------------
    n = test_df['EP'].count()
    s_name = 'pred_y_deg'+str(degree)+'_byTest'
    test_df[s_name] = est.predict(test_x)
    RMS_test = np.sqrt(np.sum( (test_df[s_name] - test_df['EP'])**2 ) /n )
    print test_df.head()

    #---------predict y from train_x---------------------
    n = train_df['EP'].count()
    s_name = 'pred_y_deg'+str(degree)+'_byTrain'
    train_df[s_name] = est.predict(train_x)
    RMS_train = np.sqrt(np.sum( (train_df[s_name] - train_df['EP'])**2 ) /n )
    print train_df.head()    
    
    return [RMS_train,RMS_test]
In [29]:
deg =2
RMS = Reg(deg)  #算出來=4.17413184555, 跟原本w/o sklearn的答案4.17413184559極為近似
print '*'*64
print 'RMS=', RMS
plot:
coef= [(11.937702827276379, 1), (11.937702827270476, 10), (6.4896595627186322, 3), (6.4896595627147926, 17), (1.0049661217227708, 2), (1.0049661217210981, 14)]
feature_names= ['x0', 'x0 x4', 'x2', 'x2 x4', 'x1', 'x1 x4']
****************************************************************
         T      V       AP     RH      EP  x0  pred_y_deg2_byTest
400  17.29  44.06  1016.24  77.56  461.38   1          460.083923
401  17.46  40.22  1006.70  91.96  461.74   1          458.390045
402  11.56  40.43  1025.48  74.75  489.54   1          472.619129
403   7.51  41.01  1024.61  97.41  477.61   1          478.499020
404  17.08  38.58  1015.41  73.42  461.49   1          463.309447
       T      V       AP     RH      EP  x0  pred_y_deg2_byTrain
0   8.34  40.77  1010.84  90.01  480.48   1           479.718500
1  23.64  58.49  1011.40  74.20  445.75   1           444.289314
2  29.74  56.90  1007.15  41.91  438.76   1           437.117475
3  19.07  49.69  1007.22  76.79  453.09   1           453.610248
4  11.80  40.66  1017.13  97.20  464.43   1           469.758943
****************************************************************
RMS= [3.8449709277448352, 4.1741318455462775]
C:\Users\Phoebe Huang\Anaconda2\lib\site-packages\ipykernel\__main__.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Phoebe Huang\Anaconda2\lib\site-packages\ipykernel\__main__.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [30]:
deg =3
RMS =  Reg(deg) #算出來 4.08913247008, 跟原本4.0766289726稍有落差
print '*'*64
print 'RMS=', RMS
plot:
coef= [(1384.938866313649, 3), (1384.9388311559953, 17), (1384.9388311331727, 51), (1141.2757909896736, 35), (1141.2757814906367, 1), (1141.2757639017473, 10)]
feature_names= ['x2', 'x2 x4', 'x2 x4^2', 'x0 x4^2', 'x0', 'x0 x4']
****************************************************************
         T      V       AP     RH      EP  x0  pred_y_deg2_byTest  \
400  17.29  44.06  1016.24  77.56  461.38   1          460.083923   
401  17.46  40.22  1006.70  91.96  461.74   1          458.390045   
402  11.56  40.43  1025.48  74.75  489.54   1          472.619129   
403   7.51  41.01  1024.61  97.41  477.61   1          478.499020   
404  17.08  38.58  1015.41  73.42  461.49   1          463.309447   

     pred_y_deg3_byTest  
400          461.165770  
401          458.609422  
402          470.769532  
403          477.418084  
404          462.963611  
       T      V       AP     RH      EP  x0  pred_y_deg2_byTrain  \
0   8.34  40.77  1010.84  90.01  480.48   1           479.718500   
1  23.64  58.49  1011.40  74.20  445.75   1           444.289314   
2  29.74  56.90  1007.15  41.91  438.76   1           437.117475   
3  19.07  49.69  1007.22  76.79  453.09   1           453.610248   
4  11.80  40.66  1017.13  97.20  464.43   1           469.758943   

   pred_y_deg3_byTrain  
0           480.429046  
1           444.334063  
2           437.276086  
3           452.852912  
4           469.277248  
****************************************************************
RMS= [3.5999502326134096, 4.0891324700768052]
C:\Users\Phoebe Huang\Anaconda2\lib\site-packages\ipykernel\__main__.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Phoebe Huang\Anaconda2\lib\site-packages\ipykernel\__main__.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Prob 3-2  

For M = 3 (order=3), use L1-regulization

In [ ]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import linear_model
degree = 3
def L1_Reg(alpha):
    est = make_pipeline(PolynomialFeatures(degree), linear_model.Lasso(alpha=alpha))
    est.fit(train_x, train_y)
    #print est.steps
    coef = est.steps[-1][1].coef_#.ravel()
    coef_pair = zip(np.abs(coef),range(coef.size))
    plt.plot(range(coef.size),np.abs(coef),'b-')
    plt.xlabel('coef order')
    plt.ylabel('abs(w)')
    plt.title('Coef Weight')
    #plt.show()
    #plt.close()
    coef_pair.sort()
    rank = [ coef_pair[-i] for i in range(1,7) ]
    print 'alpha=',alpha,'coef=',rank
    #E_W = alpha/2*np.sum([np.abs(i) for i in coef])
    #print E_W
    feature_names = est.steps[0][1].get_feature_names()
    #print feature_names
    print [ feature_names[pair[1]] for pair in rank ]
    print '*'*64
In [ ]:
for j in range(1000,10000,1000):  #設定lenda範圍
    lenda = (float)(j)/10000000 
    L1_Reg(lenda)
In [ ]:
 

沒有留言:

張貼留言