Created
March 25, 2015 16:59
-
-
Save drussellmrichie/0d308585c711d1525890 to your computer and use it in GitHub Desktop.
My attempt at an ANN for predicting product category in the Otto Kaggle competition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
With 93/120/9 FFN with sigmoid hidden layer, and fixing the activations on | |
the output layer (make them sum to one, put on interval 0-1), and train for | |
5 epochs, only get score of 1.28 | |
""" | |
import pandas as pd | |
import os | |
from pybrain.tools.shortcuts import buildNetwork | |
from pybrain.supervised.trainers import BackpropTrainer | |
#from pybrain.structure import TanhLayer | |
from pybrain.structure import SigmoidLayer | |
from pybrain.datasets import SupervisedDataSet | |
import numpy as np | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn import cross_validation | |
os.chdir('/Users/russellrichie/otto') | |
train = pd.read_csv('train.csv') | |
test = pd.read_csv('test.csv').drop('id',axis=1) | |
train.head() | |
train.tail() | |
# appears data are *not* randomized, so shuffle them to help ANN training | |
# or at least supposedly help training....not sure I've seen a difference in | |
# my testing... | |
train.reindex(np.random.permutation(train.index)) | |
trainX = train.drop(['id','target'], axis=1) | |
trainY = train['target'] | |
# Line below tells us whether there are missing values (there aren't)...should | |
# print False if there aren't any missing values | |
if True not in np.array(trainX.isnull()): | |
"print there are no missing values!" | |
# see freqs of labels (most freq is not quite 10x more freq than least freq) | |
trainY.value_counts().plot(kind='bar') | |
""" | |
Build and train a feedforward neural network!!!! | |
""" | |
# bias in input function; hidden layer activation function is tanh | |
net = buildNetwork(93, 120, 9, bias=True, #hiddenclass = TanhLayer) | |
hiddenclass = SigmoidLayer) | |
#net = buildNetwork(93, 200, 9, bias=False, hiddenclass = SigmoidLayer) | |
trainYvectorized = pd.get_dummies(trainY, prefix= 'target') | |
trainDs = SupervisedDataSet(93, 9) | |
for index, row in trainX.iterrows(): | |
trainDs.addSample(row, trainYvectorized.ix[index]) | |
trainer = BackpropTrainer(net, trainDs) | |
#trainer.trainUntilConvergence() | |
epochNumb = 5 # two epochs was better than one, but three was not better than two... | |
for epochInd in range(epochNumb): | |
print "current epoch number is {}".format(epochInd) | |
trainer.train() | |
# Now classify the test set, and see what kind of error we get on Kaggle.
# There must be a way to build a pybrain dataset directly from a
# DataFrame/array-like, but iterating row-by-row works.
#testDs = ClassificationDataSet(93)
predictions = np.zeros(shape=(len(test), 9))
for index, row in test.iterrows():
    #testDs.addSample(row)
    predictions[index, :] = net.activate(row)

# The competition requires probabilities: clip each output onto [0, 1], then
# scale each row so it sums to 1.  The original per-row Python loop is
# replaced by a single vectorized division -- identical result, done in numpy.
predictions = predictions.clip(0, 1)
predictions = predictions / predictions.sum(axis=1, keepdims=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment