Created
March 25, 2015 16:59
-
-
Save drussellmrichie/0d308585c711d1525890 to your computer and use it in GitHub Desktop.
My attempt at an ANN for predicting product category in the Otto Kaggle competition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
With 93/120/9 FFN with sigmoid hidden layer, and fixing the activations on | |
the output layer (make them sum to one, put on interval 0-1), and train for | |
5 epochs, only get score of 1.28 | |
""" | |
import pandas as pd | |
import os | |
from pybrain.tools.shortcuts import buildNetwork | |
from pybrain.supervised.trainers import BackpropTrainer | |
#from pybrain.structure import TanhLayer | |
from pybrain.structure import SigmoidLayer | |
from pybrain.datasets import SupervisedDataSet | |
import numpy as np | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn import cross_validation | |
os.chdir('/Users/russellrichie/otto') | |
train = pd.read_csv('train.csv') | |
test = pd.read_csv('test.csv').drop('id',axis=1) | |
train.head() | |
train.tail() | |
# appears data are *not* randomized, so shuffle them to help ANN training | |
# or at least supposedly help training....not sure I've seen a difference in | |
# my testing... | |
train.reindex(np.random.permutation(train.index)) | |
trainX = train.drop(['id','target'], axis=1) | |
trainY = train['target'] | |
# Line below tells us whether there are missing values (there aren't)...should | |
# print False if there aren't any missing values | |
if True not in np.array(trainX.isnull()): | |
"print there are no missing values!" | |
# see freqs of labels (most freq is not quite 10x more freq than least freq) | |
trainY.value_counts().plot(kind='bar') | |
""" | |
Build and train a feedforward neural network!!!! | |
""" | |
# bias in input function; hidden layer activation function is tanh | |
net = buildNetwork(93, 120, 9, bias=True, #hiddenclass = TanhLayer) | |
hiddenclass = SigmoidLayer) | |
#net = buildNetwork(93, 200, 9, bias=False, hiddenclass = SigmoidLayer) | |
trainYvectorized = pd.get_dummies(trainY, prefix= 'target') | |
trainDs = SupervisedDataSet(93, 9) | |
for index, row in trainX.iterrows(): | |
trainDs.addSample(row, trainYvectorized.ix[index]) | |
trainer = BackpropTrainer(net, trainDs) | |
#trainer.trainUntilConvergence() | |
epochNumb = 5 # two epochs was better than one, but three was not better than two... | |
for epochInd in range(epochNumb): | |
print "current epoch number is {}".format(epochInd) | |
trainer.train() | |
# Now classify the test set, and see what kind of error we get on Kaggle.
# There must be a way to build a pybrain dataset directly from a
# DataFrame/array-like, but iterating row-by-row works.
#testDs = ClassificationDataSet(93)
predictions = np.zeros(shape=(len(test), 9))
for index, row in test.iterrows():
    #testDs.addSample(row)
    predictions[index, :] = net.activate(row)

# The competition requires probabilities: clip each output onto [0, 1], then
# scale each row so it sums to 1.  The original per-row Python loop is
# replaced by a single vectorized division -- identical result, done in numpy.
predictions = predictions.clip(0, 1)
predictions = predictions / predictions.sum(axis=1, keepdims=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment