%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
In the clean option file. We have removed duplicate values, and added sp500 price of previous day and option vol of previous day.
optiondata = pd.read_table("cleanoption", delimiter = ",")
# optiondata.isnull().any() "This shows no cell is empty"
print optiondata.head(5)
print optiondata.shape
Further, we want to group data using 'todaydate' and 'settledate'. So each row should be identified by unique pairs of (settledate, todaydate). df_multi is multi-index dataframe identified by ['settledate','todaydate','optionstrike', 'optiontype'], so each row correspinding to one specific option and we want to combine rows with same (settledate, todaydate) pairs to be our training and testing data. In df1, we count how many options belongs to one specific (settledate, todaydate) pair and draw a line to show the distribution of the number of option ranges over different pair.
df_multi = optiondata.set_index(['settledate','todaydate','optionstrike', 'optiontype'])
df1 = optiondata.groupby(['settledate','todaydate']).count()
print df_multi.head(2)
df1['count'] = df1.daystosettle
df1 = df1.drop(df1.columns[range(13)], axis=1)
df1.head(2)
num = df1.values
print df1.shape
print df1.describe()
plt.plot(num)
plt.title("option number distribution")
plt.xlabel("Pair")
plt.ylabel("Frequency")
fig = plt.gcf()
From the above graph we can see that the number of options belongs to each pair changes quite a bit. Further, we are only interested in the options that has a strike price within 100 of today's sp500 price.
within100 = optiondata[abs(optiondata.today_sp_price - optiondata.optionstrike)<100]
df2 = within100.groupby(['settledate','todaydate']).count()
df2['count'] = df2.daystosettle
df2 = df2.drop(df2.columns[range(13)], axis=1)
num2 = df2.values
print df2.shape
print df2.describe()
plt.plot(num2)
plt.title("option_within100 number distribution")
plt.xlabel("Pair")
plt.ylabel("Frequency")
fig = plt.gcf()
# Group options into three category: in the money(1), at the money(0) and out of money(-1)
def inthemoney(row):
if (row['today_sp_price'] - row['optionstrike']) <= 0:
return 0
elif (row['optiontype']== 'call' and row['today_sp_price'] - row['optionstrike']) > 0:
return 1
elif (row['optiontype']== 'put' and row['today_sp_price'] - row['optionstrike']) < 0:
return 1
else:
return -1
df_money = optiondata
df_money['in_the_money'] = optiondata.apply(lambda row: inthemoney(row), axis=1)
df3 = df_money.groupby(['settledate','todaydate','in_the_money','optiontype']).count()
df3['count'] = df3.daystosettle
df3 = df3.drop(df3.columns[range(12)], axis=1)
#df_multi3 = df_multi2.set_index(['settledate','todaydate', 'in_the_money', 'optiontype'])
print df_money.shape
putoptions= df_money[df_money.optiontype == 'put']
print "number of put optionis " + str(putoptions.shape[0])
calloptions= df_money[df_money.optiontype == 'call']
print "number of call optionis " + str(calloptions.shape[0])
inMoneyPut = putoptions[putoptions.in_the_money == 1]
print "number of in the money put option is " + str(inMoneyPut.shape[0])
atMoneyPut = putoptions[putoptions.in_the_money == 0]
print "number of at the money put option is " + str(atMoneyPut.shape[0])
outMoneyPut = putoptions[putoptions.in_the_money == -1]
print "number of out the money put option is " + str(outMoneyPut.shape[0])
inMoneyCall = calloptions[calloptions.in_the_money == 1]
print "number of in the money call option is " + str(inMoneyCall.shape[0])
atMoneyCall = calloptions[calloptions.in_the_money == 0]
print "number of at the money call option is " + str(atMoneyCall.shape[0])
outMoneyCall = calloptions[calloptions.in_the_money == -1]
print "number of out the money call option is " + str(outMoneyCall.shape[0])
def inthemoney(row):
if (row['today_sp_price'] - row['optionstrike']) == 0:
return 0
elif (row['optiontype']== 'call' and row['today_sp_price'] - row['optionstrike']) > 0:
return 1
elif (row['optiontype']== 'put' and row['today_sp_price'] - row['optionstrike']) < 0:
return 1
else:
return -1
df_money = optiondata
df_money['in_the_money'] = optiondata.apply(lambda row: inthemoney(row), axis=1)
df3 = df_money.groupby(['settledate','todaydate','in_the_money','optiontype']).count()
df3['count'] = df3.daystosettle
df3 = df3.drop(df3.columns[range(12)], axis=1)
print df_money.shape
putoptions= df_money[df_money.optiontype == 'put']
print "number of put optionis " + str(putoptions.shape[0])
calloptions= df_money[df_money.optiontype == 'call']
print "number of call optionis " + str(calloptions.shape[0])
inMoneyPut = putoptions[putoptions.in_the_money == 1]
print "number of in the money put option is " + str(inMoneyPut.shape[0])
atMoneyPut = putoptions[putoptions.in_the_money == 0]
print "number of at the money put option is " + str(atMoneyPut.shape[0])
outMoneyPut = putoptions[putoptions.in_the_money == -1]
print "number of out the money put option is " + str(outMoneyPut.shape[0])
inMoneyCall = calloptions[calloptions.in_the_money == 1]
print "number of in the money call option is " + str(inMoneyCall.shape[0])
atMoneyCall = calloptions[calloptions.in_the_money == 0]
print "number of at the money call option is " + str(atMoneyCall.shape[0])
outMoneyCall = calloptions[calloptions.in_the_money == -1]
print "number of out the money call option is " + str(outMoneyCall.shape[0])
Next, we are going to put the first 20 options that has strike price closest to today's sp500 corresponding to the same (today date, settlement date) in one row according to following steps:
def getOptionType(row):
if row['optiontype']=='put':
return 0
else:
return 1
def changeType(df):
df['type'] = df.apply(lambda row: getOptionType(row), axis = 1 )
df['optiontype']=df['type']
def diffStrikeSp500(row):
return abs(row['optionstrike']-row['today_sp_price'])
def sortAbs(df):
df['abs'] = df.apply(lambda row: diffStrikeSp500(row), axis = 1 )
return df.sort_values(by = 'abs')
changeType(df_money)
df_money = sortAbs(df_money)
def addToDict(row, optiondict):
if optiondict.has_key((row['todaydate'],row['settledate'])):
if len(optiondict[(row['todaydate'],row['settledate'])]) != 20:
optiondict[(row['todaydate'],row['settledate'])].extend(
[row['optiontype'],row['optionstrike'],
row['optionclosingprice'],row['optionhighprice'], row['optionlowprice'],row['optionvol'], row['optionopenint'],
row['prevPrice1d'],row['prevVol1d'], row['in_the_money']])
#optiondict[(row['todaydate'],row['settledate'])]= temp
else:
optiondict[(row['todaydate'],row['settledate'])] = [row['daystosettle'],row['settle_sp_price'],row['today_sp_price'],row['optiontype'],row['optionstrike'],
row['optionclosingprice'],row['optionhighprice'], row['optionlowprice'],row['optionvol'], row['optionopenint'],
row['prevPrice1d'],row['prevVol1d'], row['in_the_money']]
def createDic(df):
myDic ={}
df.apply(lambda row: addToDict(row, myDic), axis=1)
return myDic
myDic = createDic(df_money)
def adjustDic(myDict):
for k, v in myDict.items():
if len(v) < 203:
#print len(myDict[k])
del myDict[k]
else:
myDict[k] = myDict[k][:203]
adjustDic(myDic)
df3=pd.DataFrame.from_dict(myDic)
df4 = df3.T
df4.reset_index(inplace=True)
df5 = df4.rename(columns={'level_1': 'settledate', 'level_0': 'todaydate'})
df5.to_csv("combinedoption", index=False)
df5
Training Part Part I: Random Forest Classsification
1. we divide trainig set and test set based on settlement date. If a option's settledate is after the end of 2015, we use it as test data; otherwise, training data.
2. Target: whether settlement day's sp500 price is within 40 of today's sp500 price
X: ['optiontype','optionstrike',
'optionclosingprice','optionhighprice', 'optionlowprice','optionvol', 'optionopenint',
'prevPrice1d','prevVol1d', 'in_the_money'] for each option and ['daystosettle', 'today_sp_price'] for all options belong to the same (settledate, toadaydate) pair.
3. Train a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
def within40(row):
if abs(row[1] - row[2]) < 40:
return 1
else:
return 0
def addTarget40(df):
""" if sp500 price is within 40 of today's price, class 1, otherwise 0
"""
range40 = df.apply(lambda row: within40(row), axis=1)
df['range40'] = range40
fig = plt.figure()
fig.suptitle('Distribution of sp500 price difference within range40', fontsize=14, style='italic')
df['range40'].hist()
plt.show()
#plt.savefig('/Users/mengranwang/desktop/range40.png')
addTarget40(df5)
def train(df):
mask1 = (df['settledate'] < '2015-6-31')
training= df.loc[mask1]
mask2 = (df['settledate'] > '2015-6-31')
test= df.loc[mask2]
col_list = list(df)
col_list[4:205]
x_list=[0]
x_list.extend(col_list[4:205])
X_test = test.as_matrix(x_list)
y_test = test.as_matrix(['range40'])[:,0]
X_train = training.as_matrix(x_list)
y_train = training.as_matrix(['range40'])[:,0]
regres = RandomForestClassifier(n_estimators=1000, max_features="sqrt", n_jobs=-1, oob_score=True)
regres.fit(X_train, y_train)
pred = regres.predict(X_test)
pred_train = regres.predict(X_train)
print "oobscore", regres.oob_score_
pscore = metrics.accuracy_score(y_test, pred)
pscore_train = metrics.accuracy_score(y_train, pred_train)
print "Test accuracy: ", pscore
print "Training accuracy: ", pscore_train
cf = confusion_matrix(y_test, pred, labels=[1, 0])
print pd.DataFrame.from_items([('True: 1', cf[0]), ('True: 0', cf[1])], orient='index', columns=['Predicted: 1', 'Predicted: 0'])
train(df5)