Analyzing Financial Time Series Using BigQuery and Cloud Datalab

SATYAJIT MAITRA
Jul 25

Objectives

Use Cloud Shell

datalab create instance-name
!gsutil cp gs://solutions-public-assets/bigquery-datalab/* .
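
If the Datalab instance is already running from an earlier session, you don't need to recreate it; as a hedged example (assuming the same instance name), you can reconnect with:

datalab connect instance-name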

Set up

import pandas as pd
from pandas.tools.plotting import autocorrelation_plot
from pandas.tools.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
import google.datalab.bigquery as bq  # serverless data warehouse
import tensorflow as tf  # machine learning
snp = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.snp'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
nyse = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.nyse'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
djia = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.djia'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
nikkei = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.nikkei'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
hangseng = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.hangseng'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
ftse = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.ftse'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
dax = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.dax'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
aord = bq.Query.from_table(bq.Table('bingo-ml-1.market_data.aord'), fields=['Date', 'Close']).execute().result().to_dataframe().set_index('Date')
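
The eight queries above are identical apart from the table name, so they could also be collected in a single pass. Here is a compact sketch using the same project, tables, and fields; the indices list and frames dictionary are illustrative names, not part of the original notebook:

# Sketch: fetch all eight indices with one dict comprehension (same tables and fields as above).
indices = ['snp', 'nyse', 'djia', 'nikkei', 'hangseng', 'ftse', 'dax', 'aord']
frames = {
    name: bq.Query.from_table(
        bq.Table('bingo-ml-1.market_data.' + name),
        fields=['Date', 'Close']
    ).execute().result().to_dataframe().set_index('Date')
    for name in indices
}
# frames['snp'] would then hold the same DataFrame as snp above.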
closing_data = pd.DataFrame()
closing_data['snp_close'] = snp['Close']
closing_data['nyse_close'] = nyse['Close']
closing_data['djia_close'] = djia['Close']
closing_data['nikkei_close'] = nikkei['Close']
closing_data['hangseng_close'] = hangseng['Close']
closing_data['ftse_close'] = ftse['Close']
closing_data['dax_close'] = dax['Close']
closing_data['aord_close'] = aord['Close']
# Pandas includes a very convenient function for filling gaps in the data.
closing_data = closing_data.fillna(method='ffill')
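
Forward-filling means that a gap (for example, a market holiday on one exchange) is filled with the previous day's close. A toy illustration with made-up values:

# Toy example: the NaN is replaced by the last observed value.
pd.Series([100.0, np.nan, 102.0]).fillna(method='ffill')  # -> 100.0, 100.0, 102.0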

Exploratory data analysis

closing_data.describe()
pd.concat([closing_data['snp_close'],
closing_data['nyse_close'],
closing_data['djia_close'],
closing_data['nikkei_close'],
closing_data['hangseng_close'],
closing_data['ftse_close'],
closing_data['dax_close'],
closing_data['aord_close']], axis=1).plot(figsize=(20, 15))
closing_data['snp_close_scaled'] = closing_data['snp_close'] / max(closing_data['snp_close'])
closing_data['nyse_close_scaled'] = closing_data['nyse_close'] / max(closing_data['nyse_close'])
closing_data['djia_close_scaled'] = closing_data['djia_close'] / max(closing_data['djia_close'])
closing_data['nikkei_close_scaled'] = closing_data['nikkei_close'] / max(closing_data['nikkei_close'])
closing_data['hangseng_close_scaled'] = closing_data['hangseng_close'] / max(closing_data['hangseng_close'])
closing_data['ftse_close_scaled'] = closing_data['ftse_close'] / max(closing_data['ftse_close'])
closing_data['dax_close_scaled'] = closing_data['dax_close'] / max(closing_data['dax_close'])
closing_data['aord_close_scaled'] = closing_data['aord_close'] / max(closing_data['aord_close'])
pd.concat([closing_data['snp_close_scaled'],
closing_data['nyse_close_scaled'],
closing_data['djia_close_scaled'],
closing_data['nikkei_close_scaled'],
closing_data['hangseng_close_scaled'],
closing_data['ftse_close_scaled'],
closing_data['dax_close_scaled'],
closing_data['aord_close_scaled']], axis=1).plot(figsize=(20, 15))
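
The eight scaling assignments above can also be written as one vectorized step; a small sketch assuming the same closing_data frame (close_cols and scaled are illustrative names):

# Sketch: divide every raw close column by its own maximum in one operation.
close_cols = [c for c in closing_data.columns if c.endswith('_close')]
scaled = closing_data[close_cols] / closing_data[close_cols].max()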
fig = plt.figure()
fig.set_figwidth(20)
fig.set_figheight(15)
_ = autocorrelation_plot(closing_data['snp_close'], label='snp_close')
_ = autocorrelation_plot(closing_data['nyse_close'], label='nyse_close')
_ = autocorrelation_plot(closing_data['djia_close'], label='djia_close')
_ = autocorrelation_plot(closing_data['nikkei_close'], label='nikkei_close')
_ = autocorrelation_plot(closing_data['hangseng_close'], label='hangseng_close')
_ = autocorrelation_plot(closing_data['ftse_close'], label='ftse_close')
_ = autocorrelation_plot(closing_data['dax_close'], label='dax_close')
_ = autocorrelation_plot(closing_data['aord_close'], label='aord_close')
_ = plt.legend(loc='upper right')
_ = scatter_matrix(pd.concat([closing_data['snp_close_scaled'],
closing_data['nyse_close_scaled'],
closing_data['djia_close_scaled'],
closing_data['nikkei_close_scaled'],
closing_data['hangseng_close_scaled'],
closing_data['ftse_close_scaled'],
closing_data['dax_close_scaled'],
closing_data['aord_close_scaled']], axis=1), figsize=(20, 20), diagonal='kde')
log_return_data = pd.DataFrame()
log_return_data['snp_log_return'] = np.log(closing_data['snp_close']/closing_data['snp_close'].shift())
log_return_data['nyse_log_return'] = np.log(closing_data['nyse_close']/closing_data['nyse_close'].shift())
log_return_data['djia_log_return'] = np.log(closing_data['djia_close']/closing_data['djia_close'].shift())
log_return_data['nikkei_log_return'] = np.log(closing_data['nikkei_close']/closing_data['nikkei_close'].shift())
log_return_data['hangseng_log_return'] = np.log(closing_data['hangseng_close']/closing_data['hangseng_close'].shift())
log_return_data['ftse_log_return'] = np.log(closing_data['ftse_close']/closing_data['ftse_close'].shift())
log_return_data['dax_log_return'] = np.log(closing_data['dax_close']/closing_data['dax_close'].shift())
log_return_data['aord_log_return'] = np.log(closing_data['aord_close']/closing_data['aord_close'].shift())
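
Each log return is the natural log of today's close over yesterday's close, which for small daily moves is close to the simple percentage change. A quick numeric check with made-up prices:

# Hypothetical prices: log return vs. simple return for a 0.5% move.
p_yesterday, p_today = 2000.0, 2010.0
log_ret = np.log(p_today / p_yesterday)   # ~0.004988
simple_ret = p_today / p_yesterday - 1.0  # 0.005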
log_return_data.describe()
_ = pd.concat([log_return_data['snp_log_return'],
log_return_data['nyse_log_return'],
log_return_data['djia_log_return'],
log_return_data['nikkei_log_return'],
log_return_data['hangseng_log_return'],
log_return_data['ftse_log_return'],
log_return_data['dax_log_return'],
log_return_data['aord_log_return']], axis=1).plot(figsize=(20, 15))
fig = plt.figure()
fig.set_figwidth(20)
fig.set_figheight(15)
_ = autocorrelation_plot(log_return_data['snp_log_return'], label='snp_log_return')
_ = autocorrelation_plot(log_return_data['nyse_log_return'], label='nyse_log_return')
_ = autocorrelation_plot(log_return_data['djia_log_return'], label='djia_log_return')
_ = autocorrelation_plot(log_return_data['nikkei_log_return'], label='nikkei_log_return')
_ = autocorrelation_plot(log_return_data['hangseng_log_return'], label='hangseng_log_return')
_ = autocorrelation_plot(log_return_data['ftse_log_return'], label='ftse_log_return')
_ = autocorrelation_plot(log_return_data['dax_log_return'], label='dax_log_return')
_ = autocorrelation_plot(log_return_data['aord_log_return'], label='aord_log_return')
_ = plt.legend(loc='upper right')

Feature selection

TensorFlow

Feature engineering for TensorFlow

log_return_data['snp_log_return_positive'] = 0
log_return_data.ix[log_return_data['snp_log_return'] >= 0, 'snp_log_return_positive'] = 1
log_return_data['snp_log_return_negative'] = 0
log_return_data.ix[log_return_data['snp_log_return'] < 0, 'snp_log_return_negative'] = 1
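
The two columns together form a one-hot label for the sign of the S&P 500 log return. The same encoding can be written without row-wise assignment; a small vectorized sketch using np.where (same columns as above):

# Sketch: vectorized one-hot encoding of the S&P 500 return sign.
log_return_data['snp_log_return_positive'] = np.where(log_return_data['snp_log_return'] >= 0, 1, 0)
log_return_data['snp_log_return_negative'] = np.where(log_return_data['snp_log_return'] < 0, 1, 0)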
training_test_data = pd.DataFrame(
columns=[
'snp_log_return_positive', 'snp_log_return_negative',
'snp_log_return_1', 'snp_log_return_2', 'snp_log_return_3',
'nyse_log_return_1', 'nyse_log_return_2', 'nyse_log_return_3',
'djia_log_return_1', 'djia_log_return_2', 'djia_log_return_3',
'nikkei_log_return_0', 'nikkei_log_return_1', 'nikkei_log_return_2',
'hangseng_log_return_0', 'hangseng_log_return_1', 'hangseng_log_return_2',
'ftse_log_return_0', 'ftse_log_return_1', 'ftse_log_return_2',
'dax_log_return_0', 'dax_log_return_1', 'dax_log_return_2',
'aord_log_return_0', 'aord_log_return_1', 'aord_log_return_2'])
for i in range(7, len(log_return_data)):
    snp_log_return_positive = log_return_data['snp_log_return_positive'].ix[i]
    snp_log_return_negative = log_return_data['snp_log_return_negative'].ix[i]
    snp_log_return_1 = log_return_data['snp_log_return'].ix[i-1]
    snp_log_return_2 = log_return_data['snp_log_return'].ix[i-2]
    snp_log_return_3 = log_return_data['snp_log_return'].ix[i-3]
    nyse_log_return_1 = log_return_data['nyse_log_return'].ix[i-1]
    nyse_log_return_2 = log_return_data['nyse_log_return'].ix[i-2]
    nyse_log_return_3 = log_return_data['nyse_log_return'].ix[i-3]
    djia_log_return_1 = log_return_data['djia_log_return'].ix[i-1]
    djia_log_return_2 = log_return_data['djia_log_return'].ix[i-2]
    djia_log_return_3 = log_return_data['djia_log_return'].ix[i-3]
    nikkei_log_return_0 = log_return_data['nikkei_log_return'].ix[i]
    nikkei_log_return_1 = log_return_data['nikkei_log_return'].ix[i-1]
    nikkei_log_return_2 = log_return_data['nikkei_log_return'].ix[i-2]
    hangseng_log_return_0 = log_return_data['hangseng_log_return'].ix[i]
    hangseng_log_return_1 = log_return_data['hangseng_log_return'].ix[i-1]
    hangseng_log_return_2 = log_return_data['hangseng_log_return'].ix[i-2]
    ftse_log_return_0 = log_return_data['ftse_log_return'].ix[i]
    ftse_log_return_1 = log_return_data['ftse_log_return'].ix[i-1]
    ftse_log_return_2 = log_return_data['ftse_log_return'].ix[i-2]
    dax_log_return_0 = log_return_data['dax_log_return'].ix[i]
    dax_log_return_1 = log_return_data['dax_log_return'].ix[i-1]
    dax_log_return_2 = log_return_data['dax_log_return'].ix[i-2]
    aord_log_return_0 = log_return_data['aord_log_return'].ix[i]
    aord_log_return_1 = log_return_data['aord_log_return'].ix[i-1]
    aord_log_return_2 = log_return_data['aord_log_return'].ix[i-2]
    training_test_data = training_test_data.append(
        {'snp_log_return_positive':snp_log_return_positive,
         'snp_log_return_negative':snp_log_return_negative,
         'snp_log_return_1':snp_log_return_1,
         'snp_log_return_2':snp_log_return_2,
         'snp_log_return_3':snp_log_return_3,
         'nyse_log_return_1':nyse_log_return_1,
         'nyse_log_return_2':nyse_log_return_2,
         'nyse_log_return_3':nyse_log_return_3,
         'djia_log_return_1':djia_log_return_1,
         'djia_log_return_2':djia_log_return_2,
         'djia_log_return_3':djia_log_return_3,
         'nikkei_log_return_0':nikkei_log_return_0,
         'nikkei_log_return_1':nikkei_log_return_1,
         'nikkei_log_return_2':nikkei_log_return_2,
         'hangseng_log_return_0':hangseng_log_return_0,
         'hangseng_log_return_1':hangseng_log_return_1,
         'hangseng_log_return_2':hangseng_log_return_2,
         'ftse_log_return_0':ftse_log_return_0,
         'ftse_log_return_1':ftse_log_return_1,
         'ftse_log_return_2':ftse_log_return_2,
         'dax_log_return_0':dax_log_return_0,
         'dax_log_return_1':dax_log_return_1,
         'dax_log_return_2':dax_log_return_2,
         'aord_log_return_0':aord_log_return_0,
         'aord_log_return_1':aord_log_return_1,
         'aord_log_return_2':aord_log_return_2},
        ignore_index=True)

training_test_data.describe()
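
The row-by-row loop above is easy to read but slow on long histories. As a hedged alternative (same log_return_data frame and column names, modulo column order), the lagged features could be built with shift():

# Sketch: build the same lagged-feature table with shift() instead of a Python loop.
lagged = pd.DataFrame(index=log_return_data.index)
lagged['snp_log_return_positive'] = log_return_data['snp_log_return_positive']
lagged['snp_log_return_negative'] = log_return_data['snp_log_return_negative']
for name, lags in [('snp', (1, 2, 3)), ('nyse', (1, 2, 3)), ('djia', (1, 2, 3)),
                   ('nikkei', (0, 1, 2)), ('hangseng', (0, 1, 2)), ('ftse', (0, 1, 2)),
                   ('dax', (0, 1, 2)), ('aord', (0, 1, 2))]:
    for lag in lags:
        lagged['%s_log_return_%d' % (name, lag)] = log_return_data['%s_log_return' % name].shift(lag)
# Drop the first rows without a full lag history, mirroring range(7, len(...)) above.
lagged = lagged.iloc[7:].reset_index(drop=True)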
predictors_tf = training_test_data[training_test_data.columns[2:]]
classes_tf = training_test_data[training_test_data.columns[:2]]
training_set_size = int(len(training_test_data) * 0.8)
test_set_size = len(training_test_data) - training_set_size
training_predictors_tf = predictors_tf[:training_set_size]
training_classes_tf = classes_tf[:training_set_size]
test_predictors_tf = predictors_tf[training_set_size:]
test_classes_tf = classes_tf[training_set_size:]
training_predictors_tf.describe()
test_predictors_tf.describe()
def tf_confusion_metrics(model, actual_classes, session, feed_dict):
    """Compute and print precision, recall, F1 score, and accuracy for the model."""
    predictions = tf.argmax(model, 1)
    actuals = tf.argmax(actual_classes, 1)
    ones_like_actuals = tf.ones_like(actuals)
    zeros_like_actuals = tf.zeros_like(actuals)
    ones_like_predictions = tf.ones_like(predictions)
    zeros_like_predictions = tf.zeros_like(predictions)
    tp_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, ones_like_actuals),
                tf.equal(predictions, ones_like_predictions)
            ),
            "float"
        )
    )
    tn_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, zeros_like_actuals),
                tf.equal(predictions, zeros_like_predictions)
            ),
            "float"
        )
    )
    fp_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, zeros_like_actuals),
                tf.equal(predictions, ones_like_predictions)
            ),
            "float"
        )
    )
    fn_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, ones_like_actuals),
                tf.equal(predictions, zeros_like_predictions)
            ),
            "float"
        )
    )
    tp, tn, fp, fn = \
        session.run(
            [tp_op, tn_op, fp_op, fn_op],
            feed_dict
        )
    tpfn = float(tp) + float(fn)
    tpr = 0 if tpfn == 0 else float(tp)/tpfn
    # False positive rate is false positives over all actual negatives (fp + tn).
    fptn = float(fp) + float(tn)
    fpr = 0 if fptn == 0 else float(fp)/fptn
    total = float(tp) + float(fp) + float(fn) + float(tn)
    accuracy = 0 if total == 0 else (float(tp) + float(tn))/total
    recall = tpr
    tpfp = float(tp) + float(fp)
    precision = 0 if tpfp == 0 else float(tp)/tpfp

    f1_score = 0 if recall == 0 else (2 * (precision * recall)) / (precision + recall)

    print('Precision = ', precision)
    print('Recall = ', recall)
    print('F1 Score = ', f1_score)
    print('Accuracy = ', accuracy)

Binary classification with TensorFlow

sess = tf.Session()
# Define variables for the number of predictors and number of classes to remove magic numbers from our code.
num_predictors = len(training_predictors_tf.columns) # 24 in the default case
num_classes = len(training_classes_tf.columns) # 2 in the default case
# Define placeholders for the data we feed into the process - feature data and actual classes.
feature_data = tf.placeholder("float", [None, num_predictors])
actual_classes = tf.placeholder("float", [None, num_classes])
# Define a matrix of weights and initialize it with some small random values.
weights = tf.Variable(tf.truncated_normal([num_predictors, num_classes], stddev=0.0001))
biases = tf.Variable(tf.ones([num_classes]))
# Define our model...
# Here we take a softmax regression of the product of our feature data and weights.
model = tf.nn.softmax(tf.matmul(feature_data, weights) + biases)
# Define a cost function (we're using the cross entropy).
cost = -tf.reduce_sum(actual_classes*tf.log(model))
# Define a training step...
# Here we use the Adam optimizer with a learning rate of 0.0001 to minimize the cost function we just defined.
training_step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
init = tf.global_variables_initializer()
sess.run(init)
correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(actual_classes, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
for i in range(1, 30001):
    sess.run(
        training_step,
        feed_dict={
            feature_data: training_predictors_tf.values,
            actual_classes: training_classes_tf.values.reshape(len(training_classes_tf.values), 2)
        }
    )
    if i%5000 == 0:
        print(i, sess.run(
            accuracy,
            feed_dict={
                feature_data: training_predictors_tf.values,
                actual_classes: training_classes_tf.values.reshape(len(training_classes_tf.values), 2)
            }
        ))
5000 0.809896
10000 0.859375
15000 0.881076
20000 0.891493
25000 0.896701
30000 0.904514
feed_dict= {
feature_data: test_predictors_tf.values,
actual_classes: test_classes_tf.values.reshape(len(test_classes_tf.values), 2)
}
tf_confusion_metrics(model, actual_classes, sess, feed_dict)
Precision = 0.905660377358
Recall = 0.780487804878
F1 Score = 0.838427947598
Accuracy = 0.871527777778
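
For readers on TensorFlow 2, where tf.Session and placeholders no longer exist, a roughly equivalent sketch of the same softmax classifier with tf.keras might look like this (keras_model is an illustrative name; the epoch count is arbitrary):

# TF2 sketch: one dense softmax layer trained with Adam and cross entropy.
keras_model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_classes, activation='softmax', input_shape=(num_predictors,))
])
keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
keras_model.fit(training_predictors_tf.values, training_classes_tf.values, epochs=50, verbose=0)
keras_model.evaluate(test_predictors_tf.values, test_classes_tf.values)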

Feed-forward neural network with two hidden layers

sess1 = tf.Session()
num_predictors = len(training_predictors_tf.columns)
num_classes = len(training_classes_tf.columns)
feature_data = tf.placeholder("float", [None, num_predictors])
actual_classes = tf.placeholder("float", [None, 2])
weights1 = tf.Variable(tf.truncated_normal([24, 50], stddev=0.0001))
biases1 = tf.Variable(tf.ones([50]))
weights2 = tf.Variable(tf.truncated_normal([50, 25], stddev=0.0001))
biases2 = tf.Variable(tf.ones([25]))

weights3 = tf.Variable(tf.truncated_normal([25, 2], stddev=0.0001))
biases3 = tf.Variable(tf.ones([2]))
hidden_layer_1 = tf.nn.relu(tf.matmul(feature_data, weights1) + biases1)
hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_1, weights2) + biases2)
model = tf.nn.softmax(tf.matmul(hidden_layer_2, weights3) + biases3)
cost = -tf.reduce_sum(actual_classes*tf.log(model))
train_op1 = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
init = tf.global_variables_initializer()
sess1.run(init)
correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(actual_classes, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
for i in range(1, 30001):
    sess1.run(
        train_op1,
        feed_dict={
            feature_data: training_predictors_tf.values,
            actual_classes: training_classes_tf.values.reshape(len(training_classes_tf.values), 2)
        }
    )
    if i%5000 == 0:
        print(i, sess1.run(
            accuracy,
            feed_dict={
                feature_data: training_predictors_tf.values,
                actual_classes: training_classes_tf.values.reshape(len(training_classes_tf.values), 2)
            }
        ))
5000 0.931424
10000 0.934028
15000 0.934028
20000 0.934028
25000 0.934028
30000 0.934028
feed_dict= {
feature_data: test_predictors_tf.values,
actual_classes: test_classes_tf.values.reshape(len(test_classes_tf.values), 2)
}
tf_confusion_metrics(model, actual_classes, sess1, feed_dict)
Precision = 0.921052631579
Recall = 0.853658536585
F1 Score = 0.886075949367
Accuracy = 0.90625

Conclusion

