-
-
Save agramfort/726574 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author: Oliver Mitevski | |
References: | |
A Generalized Linear Model for Principal Component Analysis of Binary Data, | |
Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar | |
The code was translated and adapted from Jakob Verbeek's | |
"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB | |
""" | |
import sys | |
import numpy as np | |
from scipy import linalg | |
# from scipy import * | |
# from numpy import * | |
# from scipy.linalg import diagsvd, svd | |
# from scipy.sparse import linalg | |
# import cPickle, gzip | |
import time | |
import pylab as pl | |
def bpca(X, L=2, max_iters=30): | |
X = np.asanyarray(X, dtype=np.float) | |
N, D = X.shape | |
# x = X | |
X = 2*X - 1 | |
delta = np.random.permutation(N) | |
Delta = X[delta[0], :] / 100 | |
U = 1e-4 * np.random.random( (N, L) ) | |
V = 1e-4 * np.random.random( (L, D) ) | |
#for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end; | |
Th = np.zeros((N, D)) | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
for iter in range(max_iters): | |
print iter | |
# Update U | |
T = np.tanh(Th/2) / Th | |
B = np.dot(V, (X - T*Delta[None,:]).T) | |
for n in range(N): | |
A = np.dot(V * T[n,:][None,:], V.T) | |
U[n,:] = linalg.solve(A, B[:,n]).T | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
Q = np.random.random(N) | |
# normalize it | |
# Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float | |
Q = linalg.norm(Q) # gives the same result | |
# Update V | |
T = np.tanh(Th/2) / Th | |
U2 = np.c_[U, np.ones((N,1))] | |
# U2 = U2 * np.tile(Q, (L+1, 1)).T | |
U2 *= Q # XXX : equivalent as above if Q is a float | |
B = np.dot(U2.T, X) | |
Uo = np.c_[U, np.ones((N, 1))] | |
for d in range(D): | |
A = np.dot(U2.T * T[:, d][None,:], Uo) | |
V2 = linalg.solve(A, B[:, d]) | |
Delta[d] = V2[-1] | |
if L > 0: | |
V[:, d] = V2[0:-1] | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
print U.shape | |
# plotM1(U[0:10000:1,0:2], labels[0:10000:10]) | |
U1, S, V = linalg.svd(U) | |
Vh = V.T | |
U1 = np.mat(U1[:, 0:L]) | |
# Vh = np.mat(Vh[0:L, :]) | |
Vh = Vh[0:L, :] | |
codes = np.dot(U, Vh.T) | |
return codes | |
def main(): | |
# Load the dataset | |
from save_load import save_file, load_file | |
try: | |
input_data = load_file(sys.argv[1]) | |
sparse = False | |
except: | |
print 'loading sparse' | |
sparse = True | |
from scipy.io import mmio as mm | |
input_data = mm.mmread(sys.argv[1]) | |
input_data = np.array(input_data.todense()) | |
N, D = input_data.shape | |
print N, D | |
# make data binary | |
# for i in range(N): | |
# for j in range(D): | |
# if input_data[i,j] > 0.0: | |
# input_data[i,j] = 1.0 | |
# save_file('news-10-stemmed.index2200/binDict', input_data) | |
# print 'saved binary' | |
X = input_data | |
labels = load_file(sys.argv[2]) | |
start = time.clock() | |
codes = bpca(X, L=2, max_iters = 20) | |
print "Time for computing binary PCA took", time.clock()-start | |
save_file(sys.argv[3], codes) | |
if __name__ == "__main__": | |
n_samples, n_features = 10, 30 | |
np.random.seed(0) | |
X = np.random.randn(n_samples, n_features) | |
codes = bpca(X, L=2, max_iters=20) | |
print codes | |
# main() |
that's a fairly empiric validation :) I would have expected "it reproduces the results form the paper" :)
maybe it should be tested on different datasets. The one I used (tf-idf feature vectors) may not be the most suitable. Nevertheless it didn't fail, so there's hope ;)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
just by visualization of the labels of the 20-newsgroups. They seemed kind of clustered, so it's working.