Third Week

In [189]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In the clean option file. We have removed duplicate values, and added sp500 price of previous day and option vol of previous day.

In [190]:
optiondata = pd.read_table("cleanoption", delimiter = ",")
# optiondata.isnull().any() "This shows no cell is empty"
print optiondata.head(5)
print optiondata.shape
    todaydate  settledate  daystosettle  today_sp_price  settle_sp_price  \
0  2013-07-10  2013-07-26            16         1652.62          1690.47   
1  2013-07-10  2013-07-26            16         1652.62          1690.47   
2  2013-07-11  2013-07-26            15         1675.02          1690.47   
3  2013-07-11  2013-07-26            15         1675.02          1690.47   
4  2013-07-15  2013-07-26            11         1682.50          1690.47   

  optiontype  optionstrike  optionclosingprice  optionhighprice  \
0        put          1580               2.325              2.5   
1       call          1650              19.200             19.8   
2        put          1580               1.050              1.2   
3       call          1650              33.150             33.8   
4        put          1610               1.050              1.1   

   optionlowprice  optionvol  optionopenint  range40  prevPrice1d  prevVol1d  
0            2.15        107           6316        1      1652.32        128  
1           18.60       2009           5446        1      1652.32       4278  
2            0.90         78           6329        1      1652.62        107  
3           32.50       3315           6595        1      1652.62       2009  
4            1.00       4594           5946        1      1675.02         69  
(47052, 15)

Further, we want to group data using 'todaydate' and 'settledate'. So each row should be identified by unique pairs of (settledate, todaydate). df_multi is multi-index dataframe identified by ['settledate','todaydate','optionstrike', 'optiontype'], so each row correspinding to one specific option and we want to combine rows with same (settledate, todaydate) pairs to be our training and testing data. In df1, we count how many options belongs to one specific (settledate, todaydate) pair and draw a line to show the distribution of the number of option ranges over different pair.

In [191]:
df_multi = optiondata.set_index(['settledate','todaydate','optionstrike', 'optiontype'])
df1 = optiondata.groupby(['settledate','todaydate']).count()
print df_multi.head(2)
df1['count'] = df1.daystosettle
df1 = df1.drop(df1.columns[range(13)], axis=1)
df1.head(2)
                                               daystosettle  today_sp_price  \
settledate todaydate  optionstrike optiontype                                 
2013-07-26 2013-07-10 1580         put                   16         1652.62   
                      1650         call                  16         1652.62   

                                               settle_sp_price  \
settledate todaydate  optionstrike optiontype                    
2013-07-26 2013-07-10 1580         put                 1690.47   
                      1650         call                1690.47   

                                               optionclosingprice  \
settledate todaydate  optionstrike optiontype                       
2013-07-26 2013-07-10 1580         put                      2.325   
                      1650         call                    19.200   

                                               optionhighprice  \
settledate todaydate  optionstrike optiontype                    
2013-07-26 2013-07-10 1580         put                     2.5   
                      1650         call                   19.8   

                                               optionlowprice  optionvol  \
settledate todaydate  optionstrike optiontype                              
2013-07-26 2013-07-10 1580         put                   2.15        107   
                      1650         call                 18.60       2009   

                                               optionopenint  range40  \
settledate todaydate  optionstrike optiontype                           
2013-07-26 2013-07-10 1580         put                  6316        1   
                      1650         call                 5446        1   

                                               prevPrice1d  prevVol1d  
settledate todaydate  optionstrike optiontype                          
2013-07-26 2013-07-10 1580         put             1652.32        128  
                      1650         call            1652.32       4278  
Out[191]:
count
settledate todaydate
2013-07-26 2013-07-10 2
2013-07-11 2
In [192]:
num = df1.values
print df1.shape
print df1.describe()
plt.plot(num)
plt.title("option number distribution")
plt.xlabel("Pair")
plt.ylabel("Frequency")
fig = plt.gcf()
(2191, 1)
             count
count  2191.000000
mean     21.475126
std      17.994728
min       1.000000
25%      10.000000
50%      17.000000
75%      28.000000
max     126.000000

From the above graph we can see that the number of options belongs to each pair changes quite a bit. Further, we are only interested in the options that has a strike price within 100 of today's sp500 price.

In [137]:
within100 = optiondata[abs(optiondata.today_sp_price - optiondata.optionstrike)<100]
df2 =  within100.groupby(['settledate','todaydate']).count()
df2['count'] = df2.daystosettle
df2 = df2.drop(df2.columns[range(13)], axis=1)
In [138]:
num2 = df2.values
print df2.shape
print df2.describe()
plt.plot(num2)
plt.title("option_within100 number distribution")
plt.xlabel("Pair")
plt.ylabel("Frequency")

fig = plt.gcf()
(2169, 1)
             count
count  2169.000000
mean     19.612725
std      16.181322
min       1.000000
25%       9.000000
50%      16.000000
75%      26.000000
max     115.000000
In [ ]:
# Group options into three category: in the money(1), at the money(0) and out of money(-1) 
In [7]:
def inthemoney(row):
    if (row['today_sp_price'] - row['optionstrike']) <= 0:
        return 0
    elif (row['optiontype']== 'call' and row['today_sp_price'] - row['optionstrike']) > 0:
        return 1
    elif (row['optiontype']== 'put' and row['today_sp_price'] - row['optionstrike']) < 0:
        return 1
    else:
        return -1
df_money = optiondata
df_money['in_the_money'] = optiondata.apply(lambda row: inthemoney(row), axis=1)
In [8]:
df3 = df_money.groupby(['settledate','todaydate','in_the_money','optiontype']).count()
df3['count'] = df3.daystosettle
df3 = df3.drop(df3.columns[range(12)], axis=1)

#df_multi3 = df_multi2.set_index(['settledate','todaydate', 'in_the_money', 'optiontype'])

Fourth Week

In [9]:
print df_money.shape
putoptions= df_money[df_money.optiontype == 'put']
print "number of put optionis " + str(putoptions.shape[0])
calloptions= df_money[df_money.optiontype == 'call']
print "number of call optionis " + str(calloptions.shape[0])
inMoneyPut = putoptions[putoptions.in_the_money == 1]
print "number of in the money put option is " + str(inMoneyPut.shape[0])
atMoneyPut = putoptions[putoptions.in_the_money == 0]
print "number of at the money put option is " + str(atMoneyPut.shape[0])
outMoneyPut = putoptions[putoptions.in_the_money == -1]
print "number of out the money put option is " + str(outMoneyPut.shape[0])
inMoneyCall = calloptions[calloptions.in_the_money == 1]
print "number of in the money call option is " + str(inMoneyCall.shape[0])
atMoneyCall = calloptions[calloptions.in_the_money == 0]
print "number of at the money call option is " + str(atMoneyCall.shape[0])
outMoneyCall = calloptions[calloptions.in_the_money == -1]
print "number of out the money call option is " + str(outMoneyCall.shape[0])
(47052, 16)
number of put optionis 21433
number of call optionis 25619
number of in the money put option is 0
number of at the money put option is 3718
number of out the money put option is 17715
number of in the money call option is 8188
number of at the money call option is 17431
number of out the money call option is 0
In [304]:
def inthemoney(row):
    if (row['today_sp_price'] - row['optionstrike']) == 0:
        return 0
    elif (row['optiontype']== 'call' and row['today_sp_price'] - row['optionstrike']) > 0:
        return 1
    elif (row['optiontype']== 'put' and row['today_sp_price'] - row['optionstrike']) < 0:
        return 1
    else:
        return -1
df_money = optiondata
df_money['in_the_money'] = optiondata.apply(lambda row: inthemoney(row), axis=1)
df3 = df_money.groupby(['settledate','todaydate','in_the_money','optiontype']).count()
df3['count'] = df3.daystosettle
df3 = df3.drop(df3.columns[range(12)], axis=1)
In [194]:
print df_money.shape
putoptions= df_money[df_money.optiontype == 'put']
print "number of put optionis " + str(putoptions.shape[0])
calloptions= df_money[df_money.optiontype == 'call']
print "number of call optionis " + str(calloptions.shape[0])
inMoneyPut = putoptions[putoptions.in_the_money == 1]
print "number of in the money put option is " + str(inMoneyPut.shape[0])
atMoneyPut = putoptions[putoptions.in_the_money == 0]
print "number of at the money put option is " + str(atMoneyPut.shape[0])
outMoneyPut = putoptions[putoptions.in_the_money == -1]
print "number of out the money put option is " + str(outMoneyPut.shape[0])
inMoneyCall = calloptions[calloptions.in_the_money == 1]
print "number of in the money call option is " + str(inMoneyCall.shape[0])
atMoneyCall = calloptions[calloptions.in_the_money == 0]
print "number of at the money call option is " + str(atMoneyCall.shape[0])
outMoneyCall = calloptions[calloptions.in_the_money == -1]
print "number of out the money call option is " + str(outMoneyCall.shape[0])
(47052, 16)
number of put optionis 21433
number of call optionis 25619
number of in the money put option is 3714
number of at the money put option is 4
number of out the money put option is 17715
number of in the money call option is 8188
number of at the money call option is 4
number of out the money call option is 17427

Next, we are going to put the first 20 options that has strike price closest to today's sp500 corresponding to the same (today date, settlement date) in one row according to following steps:

  1. sort data according to their absolute value of (strike price - today's sp500 price)
  2. create a dictionary, whose key is (today date, settlement date) pair and value is a list of our interested feature of each option.(suppose m vecor for each element)
  3. add each option in the dictionary.
  4. adjust each dict value to include 20 options (222 features for each)
  5. sort data accoridng to settledate and if settledate are the same, sort the data based on today's date.
  6. save data to file
In [305]:
def getOptionType(row):
    if row['optiontype']=='put':
        return 0
    else: 
        return 1
def changeType(df):
    df['type'] = df.apply(lambda row: getOptionType(row), axis = 1 )
    df['optiontype']=df['type']
    


def diffStrikeSp500(row):
    return abs(row['optionstrike']-row['today_sp_price'])
def sortAbs(df):
    df['abs'] = df.apply(lambda row: diffStrikeSp500(row), axis = 1 )
    return df.sort_values(by = 'abs') 

changeType(df_money)
df_money = sortAbs(df_money)
Out[305]:
daystosettle today_sp_price settle_sp_price optiontype optionstrike optionclosingprice optionhighprice optionlowprice optionvol optionopenint range40 prevPrice1d prevVol1d in_the_money abs type
count 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000 47052.000000
mean 8.406040 2051.972074 2058.949393 0.544483 2043.392842 14.322393 14.756049 13.775923 1893.833142 15073.351781 0.730872 2050.939268 1650.012157 -0.493922 50.113837 0.544483
std 4.845696 127.423588 128.656308 0.498023 138.079687 22.318629 22.789052 21.832394 3526.410195 16111.830875 0.443511 127.396609 3128.967583 0.869418 32.844691 0.498023
min 2.000000 1630.480000 1632.970000 0.000000 1520.000000 0.025000 0.000000 0.000000 0.000000 5001.000000 0.000000 1630.480000 0.000000 -1.000000 0.000000 0.000000
25% 4.000000 1982.300000 1982.850000 0.000000 1970.000000 0.750000 0.900000 0.600000 0.000000 7128.000000 0.000000 1979.260000 0.000000 -1.000000 21.930000 0.000000
50% 8.000000 2077.570000 2077.570000 1.000000 2050.000000 4.200000 4.500000 3.900000 363.000000 9945.500000 1.000000 2076.780000 302.500000 -1.000000 45.940000 1.000000
75% 12.000000 2120.790000 2126.410000 1.000000 2135.000000 17.600000 18.100000 16.900000 2319.250000 15761.250000 1.000000 2120.790000 1946.250000 1.000000 75.850000 1.000000
max 18.000000 2369.750000 2381.920000 1.000000 2455.000000 141.300000 147.700000 134.900000 46046.000000 177493.000000 1.000000 2369.750000 44570.000000 1.000000 119.990000 1.000000
In [307]:
def addToDict(row, optiondict):
    if optiondict.has_key((row['todaydate'],row['settledate'])):
        if len(optiondict[(row['todaydate'],row['settledate'])]) != 20:
            optiondict[(row['todaydate'],row['settledate'])].extend(
            [row['optiontype'],row['optionstrike'],
            row['optionclosingprice'],row['optionhighprice'], row['optionlowprice'],row['optionvol'], row['optionopenint'], 
                row['prevPrice1d'],row['prevVol1d'], row['in_the_money']])
            #optiondict[(row['todaydate'],row['settledate'])]= temp
    else:
        optiondict[(row['todaydate'],row['settledate'])] = [row['daystosettle'],row['settle_sp_price'],row['today_sp_price'],row['optiontype'],row['optionstrike'],
        row['optionclosingprice'],row['optionhighprice'], row['optionlowprice'],row['optionvol'], row['optionopenint'], 
                row['prevPrice1d'],row['prevVol1d'], row['in_the_money']]

def createDic(df):
    myDic ={}
    df.apply(lambda row: addToDict(row, myDic), axis=1)
    return myDic


myDic = createDic(df_money)  
In [308]:
def adjustDic(myDict):
    for k, v in myDict.items():
        if len(v) < 203:
            #print len(myDict[k])
            del myDict[k]
        else:
            myDict[k] = myDict[k][:203]
    
adjustDic(myDic)
df3=pd.DataFrame.from_dict(myDic)
df4 = df3.T
df4.reset_index(inplace=True)
df5 = df4.rename(columns={'level_1': 'settledate', 'level_0': 'todaydate'})
df5.to_csv("combinedoption", index=False)
In [315]:
df5
Out[315]:
todaydate settledate 0 1 2 3 4 5 6 7 ... 194 195 196 197 198 199 200 201 202 range40
0 2013-07-23 2013-07-26 3.0 1690.47 1692.39 1.0 1690.0 8.10 8.5 7.7 ... 1590.0 0.075 0.10 0.05 389.0 9532.0 1695.53 216.0 -1.0 1
1 2013-07-24 2013-07-26 2.0 1690.47 1685.94 1.0 1690.0 5.00 5.3 4.7 ... 1610.0 0.075 0.10 0.05 3508.0 12608.0 1692.39 3268.0 -1.0 1
2 2013-07-31 2013-08-02 2.0 1709.67 1687.37 0.0 1690.0 10.65 10.9 10.4 ... 1575.0 0.025 0.05 0.00 7114.0 8003.0 1685.96 194.0 -1.0 1
3 2013-08-20 2013-08-23 3.0 1663.50 1652.35 0.0 1650.0 7.15 7.5 6.8 ... 1540.0 0.075 0.10 0.05 543.0 6263.0 1646.06 1128.0 -1.0 1
4 2013-08-21 2013-08-23 2.0 1663.50 1642.80 0.0 1640.0 8.10 8.6 7.6 ... 1750.0 0.025 0.05 0.00 0.0 9774.0 1652.35 30.0 -1.0 1
5 2013-08-23 2013-08-30 7.0 1632.97 1663.50 0.0 1660.0 8.05 8.4 7.7 ... 1575.0 0.325 0.35 0.30 2845.0 7403.0 1656.96 1612.0 -1.0 1
6 2013-08-26 2013-08-30 4.0 1632.97 1656.78 0.0 1660.0 10.85 11.2 10.5 ... 1725.0 0.025 0.05 0.00 510.0 6463.0 1663.50 1508.0 -1.0 1
7 2013-08-27 2013-08-30 3.0 1632.97 1630.48 0.0 1635.0 12.05 12.5 11.6 ... 1700.0 0.075 0.10 0.05 2142.0 21310.0 1656.78 5095.0 -1.0 1
8 2013-08-28 2013-08-30 2.0 1632.97 1634.96 0.0 1640.0 9.70 9.9 9.5 ... 1565.0 0.175 0.20 0.15 452.0 7167.0 1630.48 1327.0 -1.0 1
9 2013-09-04 2013-09-06 2.0 1655.17 1653.08 1.0 1650.0 9.85 10.3 9.4 ... 1550.0 0.075 0.10 0.05 1328.0 6418.0 1639.77 1230.0 -1.0 1
10 2013-09-09 2013-09-13 4.0 1687.99 1671.71 1.0 1675.0 6.55 6.9 6.2 ... 1565.0 0.125 0.15 0.10 165.0 8298.0 1655.17 1839.0 -1.0 1
11 2013-09-10 2013-09-13 3.0 1687.99 1683.99 1.0 1685.0 6.00 6.3 5.7 ... 1600.0 0.125 0.15 0.10 466.0 14964.0 1671.71 2711.0 -1.0 1
12 2013-09-26 2013-09-30 4.0 1681.55 1698.67 1.0 1700.0 6.25 7.0 5.5 ... 1610.0 0.175 0.20 0.15 83.0 9264.0 1692.77 140.0 -1.0 1
13 2013-10-16 2013-10-25 9.0 1759.77 1721.54 1.0 1720.0 13.30 13.7 12.9 ... 1610.0 1.325 1.60 1.05 276.0 5601.0 1698.06 466.0 -1.0 1
14 2013-10-23 2013-10-25 2.0 1759.77 1746.38 1.0 1745.0 6.85 7.3 6.4 ... 1670.0 0.200 0.30 0.10 2671.0 7996.0 1754.67 182.0 -1.0 1
15 2013-11-05 2013-11-08 3.0 1770.61 1762.97 1.0 1770.0 3.30 3.5 3.1 ... 1700.0 0.300 0.35 0.25 3196.0 16548.0 1767.93 4276.0 -1.0 1
16 2013-11-20 2013-11-22 2.0 1804.76 1781.37 0.0 1780.0 5.20 5.4 5.0 ... 1900.0 0.025 0.05 0.00 0.0 7693.0 1787.87 0.0 -1.0 1
17 2013-11-27 2013-11-29 2.0 1805.81 1807.23 1.0 1805.0 4.20 4.4 4.0 ... 1725.0 0.125 0.15 0.10 1176.0 7737.0 1802.75 296.0 -1.0 1
18 2013-12-03 2013-12-06 3.0 1805.09 1795.15 1.0 1795.0 7.00 7.2 6.8 ... 1875.0 0.050 0.10 0.00 198.0 5582.0 1800.90 831.0 -1.0 1
19 2013-12-04 2013-12-06 2.0 1805.09 1792.81 1.0 1795.0 6.55 6.9 6.2 ... 1840.0 0.075 0.10 0.05 403.0 6381.0 1795.15 2614.0 -1.0 1
20 2013-12-11 2013-12-13 2.0 1775.32 1782.22 1.0 1785.0 5.15 5.5 4.8 ... 1710.0 0.500 0.60 0.40 417.0 6759.0 1802.62 1426.0 -1.0 1
21 2013-12-13 2013-12-31 18.0 1848.36 1775.32 1.0 1775.0 20.55 21.3 19.8 ... 1875.0 0.425 0.70 0.15 1281.0 49351.0 1775.50 2205.0 -1.0 0
22 2013-12-16 2013-12-31 15.0 1848.36 1786.54 1.0 1780.0 23.20 23.9 22.5 ... 1875.0 0.325 0.55 0.10 33.0 49151.0 1775.32 1281.0 -1.0 0
23 2013-12-17 2013-12-31 14.0 1848.36 1781.00 0.0 1780.0 20.25 21.0 19.5 ... 1700.0 82.200 83.70 80.70 0.0 13987.0 1786.54 0.0 1.0 0
24 2013-12-18 2013-12-31 13.0 1848.36 1810.65 1.0 1820.0 10.40 11.2 9.6 ... 1725.0 1.500 1.90 1.10 3909.0 33267.0 1781.00 5040.0 -1.0 1
25 2013-12-19 2013-12-31 12.0 1848.36 1809.60 0.0 1800.0 10.45 11.0 9.9 ... 1735.0 1.750 2.20 1.30 2226.0 8774.0 1810.65 7722.0 -1.0 1
26 2013-12-20 2013-12-31 11.0 1848.36 1818.32 1.0 1820.0 12.25 12.9 11.6 ... 1750.0 1.925 2.40 1.45 7478.0 33233.0 1809.60 1178.0 -1.0 1
27 2013-12-23 2013-12-31 8.0 1848.36 1827.99 1.0 1830.0 8.50 9.2 7.8 ... 1900.0 0.250 0.40 0.10 785.0 48796.0 1818.32 9606.0 -1.0 1
28 2013-12-24 2013-12-31 7.0 1848.36 1833.32 1.0 1835.0 8.10 11.1 5.1 ... 1900.0 0.175 0.25 0.10 658.0 48794.0 1827.99 785.0 -1.0 1
29 2013-12-26 2013-12-31 5.0 1848.36 1842.02 1.0 1840.0 8.30 9.6 7.0 ... 1770.0 0.325 0.50 0.15 13.0 8001.0 1833.32 4496.0 -1.0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
897 2017-01-19 2017-01-27 8.0 2296.68 2271.89 1.0 2270.0 11.10 11.6 10.6 ... 2165.0 1.275 1.45 1.10 0.0 5652.0 2267.89 0.0 -1.0 1
898 2017-01-20 2017-01-27 7.0 2296.68 2263.69 0.0 2265.0 8.30 8.6 8.0 ... 2175.0 1.025 1.10 0.95 0.0 8832.0 2271.89 0.0 -1.0 1
899 2017-01-23 2017-01-27 4.0 2296.68 2271.31 1.0 2270.0 7.50 8.0 7.0 ... 2175.0 0.500 0.65 0.35 5.0 9084.0 2263.69 0.0 -1.0 1
900 2017-01-24 2017-01-27 3.0 2296.68 2265.20 0.0 2265.0 7.15 7.4 6.9 ... 2180.0 0.425 0.50 0.35 0.0 8479.0 2271.31 0.0 -1.0 1
901 2017-01-25 2017-01-27 2.0 2296.68 2293.47 1.0 2295.0 5.05 5.3 4.8 ... 2215.0 0.125 0.15 0.10 353.0 8989.0 2265.20 0.0 -1.0 1
902 2017-01-26 2017-02-03 8.0 2280.85 2298.37 1.0 2300.0 10.75 11.1 10.4 ... 2195.0 0.675 0.80 0.55 0.0 5299.0 2293.47 1662.0 -1.0 1
903 2017-01-30 2017-02-03 4.0 2280.85 2294.69 1.0 2295.0 5.45 5.7 5.2 ... 2195.0 0.500 0.60 0.40 0.0 7796.0 2296.68 0.0 -1.0 1
904 2017-01-31 2017-02-03 3.0 2280.85 2280.90 1.0 2275.0 10.80 11.2 10.4 ... 2200.0 0.850 1.00 0.70 0.0 10774.0 2294.69 0.0 -1.0 1
905 2017-01-31 2017-02-17 17.0 2347.22 2280.90 1.0 2280.0 16.55 17.0 16.1 ... 2325.0 2.425 2.55 2.30 0.0 47777.0 2294.69 0.0 -1.0 0
906 2017-02-01 2017-02-03 2.0 2280.85 2278.87 0.0 2275.0 4.00 4.3 3.7 ... 2210.0 0.475 0.65 0.30 0.0 6788.0 2280.90 0.0 -1.0 1
907 2017-02-01 2017-02-17 16.0 2347.22 2278.87 1.0 2280.0 18.45 18.8 18.1 ... 2325.0 2.425 2.60 2.25 0.0 49754.0 2280.90 0.0 -1.0 0
908 2017-02-02 2017-02-17 15.0 2347.22 2279.55 1.0 2280.0 12.55 12.9 12.2 ... 2240.0 9.000 9.30 8.70 0.0 9783.0 2278.87 0.0 -1.0 0
909 2017-02-03 2017-02-10 7.0 2307.87 2280.85 1.0 2280.0 12.20 13.1 11.3 ... 2180.0 0.925 1.15 0.70 0.0 9882.0 2279.55 0.0 -1.0 1
910 2017-02-03 2017-02-17 14.0 2347.22 2280.85 1.0 2280.0 16.30 16.7 15.9 ... 2325.0 1.525 1.60 1.45 0.0 54699.0 2279.55 0.0 -1.0 0
911 2017-02-06 2017-02-10 4.0 2307.87 2291.70 1.0 2290.0 7.55 7.7 7.4 ... 2180.0 0.300 0.35 0.25 9.0 9995.0 2280.85 0.0 -1.0 1
912 2017-02-06 2017-02-17 11.0 2347.22 2291.70 1.0 2290.0 11.70 12.2 11.2 ... 2330.0 1.125 1.35 0.90 533.0 24263.0 2280.85 0.0 -1.0 0
913 2017-02-07 2017-02-10 3.0 2307.87 2292.56 1.0 2290.0 11.15 11.5 10.8 ... 2215.0 0.500 0.60 0.40 0.0 5350.0 2291.70 177.0 -1.0 1
914 2017-02-07 2017-02-17 10.0 2347.22 2292.56 1.0 2290.0 15.10 15.5 14.7 ... 2335.0 0.925 1.05 0.80 0.0 7786.0 2291.70 159.0 -1.0 0
915 2017-02-08 2017-02-10 2.0 2307.87 2293.08 1.0 2290.0 7.05 7.5 6.6 ... 2220.0 0.300 0.45 0.15 0.0 8730.0 2292.56 0.0 -1.0 1
916 2017-02-08 2017-02-17 9.0 2347.22 2293.08 0.0 2290.0 10.30 10.7 9.9 ... 2335.0 0.575 0.70 0.45 0.0 7786.0 2292.56 0.0 -1.0 0
917 2017-02-09 2017-02-17 8.0 2347.22 2294.67 1.0 2290.0 14.75 15.4 14.1 ... 2260.0 2.350 2.50 2.20 0.0 17485.0 2293.08 0.0 -1.0 0
918 2017-02-10 2017-02-17 7.0 2347.22 2307.87 1.0 2310.0 8.35 8.7 8.0 ... 2270.0 1.425 1.55 1.30 0.0 12612.0 2294.67 0.0 -1.0 1
919 2017-02-13 2017-02-17 4.0 2347.22 2316.10 1.0 2315.0 10.15 10.6 9.7 ... 2275.0 45.000 46.30 43.70 0.0 38522.0 2307.87 4.0 1.0 1
920 2017-02-14 2017-02-17 3.0 2347.22 2328.25 1.0 2330.0 5.75 6.3 5.2 ... 2280.0 47.600 48.90 46.30 0.0 9636.0 2316.10 0.0 1.0 1
921 2017-02-15 2017-02-17 2.0 2347.22 2343.44 1.0 2345.0 4.60 5.3 3.9 ... 2390.0 0.150 0.30 0.00 10.0 7795.0 2328.25 0.0 -1.0 1
922 2017-02-21 2017-02-24 3.0 2363.81 2351.16 1.0 2350.0 10.50 10.9 10.1 ... 2435.0 0.100 0.20 0.00 0.0 5537.0 2347.22 0.0 -1.0 1
923 2017-02-21 2017-02-28 7.0 2369.75 2351.16 1.0 2350.0 13.45 14.0 12.9 ... 2250.0 0.500 0.65 0.35 0.0 9858.0 2347.22 0.0 -1.0 1
924 2017-02-22 2017-02-24 2.0 2363.81 2365.38 1.0 2365.0 4.20 4.6 3.8 ... 2275.0 0.225 0.35 0.10 0.0 5362.0 2351.16 0.0 -1.0 1
925 2017-02-28 2017-03-03 3.0 2381.92 2369.75 1.0 2370.0 9.05 9.5 8.6 ... 2300.0 69.150 70.50 67.80 0.0 14508.0 2367.34 0.0 1.0 1
926 2017-03-01 2017-03-03 2.0 2381.92 2363.64 0.0 2360.0 2.75 3.0 2.5 ... 2425.0 0.150 0.30 0.00 0.0 6850.0 2369.75 0.0 -1.0 1

927 rows × 206 columns

Training Part Part I: Random Forest Classsification

1. we divide trainig set and test set based on settlement date. If a option's settledate is after the end of          2015,  we use it as test data; otherwise, training data.
2. Target: whether settlement day's sp500 price is within 40 of today's sp500 price
      X: ['optiontype','optionstrike',
     'optionclosingprice','optionhighprice', 'optionlowprice','optionvol', 'optionopenint', 
      'prevPrice1d','prevVol1d', 'in_the_money'] for each option and ['daystosettle', 'today_sp_price']                 for all options belong to the same (settledate, toadaydate) pair. 
3. Train a random forest classifier
In [309]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
In [310]:
def within40(row):
    if abs(row[1] - row[2]) < 40:
        return 1
    else:
        return 0

def addTarget40(df):
    """ if sp500 price is within 40 of today's price, class 1, otherwise 0
    """
    range40 = df.apply(lambda row: within40(row), axis=1)
    df['range40'] = range40
    fig = plt.figure()
    fig.suptitle('Distribution of sp500 price difference within range40', fontsize=14, style='italic')
    df['range40'].hist()
    plt.show()
    #plt.savefig('/Users/mengranwang/desktop/range40.png')

addTarget40(df5)
In [312]:
def train(df):
    mask1 = (df['settledate'] < '2015-6-31')
    training= df.loc[mask1]
    mask2 = (df['settledate'] > '2015-6-31')
    test= df.loc[mask2]

    col_list = list(df)
    col_list[4:205]
    x_list=[0]
    x_list.extend(col_list[4:205])

    X_test = test.as_matrix(x_list)
    y_test = test.as_matrix(['range40'])[:,0]

    X_train = training.as_matrix(x_list)
    y_train = training.as_matrix(['range40'])[:,0]
    
    regres = RandomForestClassifier(n_estimators=1000, max_features="sqrt", n_jobs=-1, oob_score=True)
    regres.fit(X_train, y_train)
    pred = regres.predict(X_test)
    pred_train = regres.predict(X_train)
    print "oobscore", regres.oob_score_
    pscore = metrics.accuracy_score(y_test, pred)
    pscore_train = metrics.accuracy_score(y_train, pred_train)
    print "Test accuracy: ", pscore
    print "Training accuracy: ", pscore_train
    cf = confusion_matrix(y_test, pred, labels=[1, 0])
    print pd.DataFrame.from_items([('True: 1', cf[0]), ('True: 0', cf[1])], orient='index', columns=['Predicted: 1', 'Predicted: 0'])

train(df5)
oobscore 0.752293577982
Test accuracy:  0.759162303665
Training accuracy:  1.0
         Predicted: 1  Predicted: 0
True: 1           267            18
True: 0            74            23