joaorafaelm · November 22, 2017 19:17
diff --git a/LDApredict.py b/LDApredict.py
 # derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
 # an explanation of the approach is available here: https://www.linkedin.com/pulse/dissociating-training-predicting-latent-dirichlet-lucien-tardres

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 import pickle

 # Prediction-only script: rebuild an LDA model from parameters saved at
 # training time and print the topic weights for a few new documents.

 # create a blank (unfitted) model; its learned state is filled in below
 lda = LatentDirichletAllocation()

 # Load the vocabulary and the learned LDA parameters from file.
 # SECURITY: pickle.load can execute arbitrary code while deserializing, so
 # 'outfile' must come from a trusted source (e.g. your own training run).
 with open('outfile', 'rb') as fd:
     (features,
      lda.components_,
      lda.exp_dirichlet_component_,
      lda.doc_topic_prior_) = pickle.load(fd)

 # the dataset to predict on (first two samples were also in the training set so one can compare)
 data_samples = [
     "I like to eat broccoli and bananas.",
     "I ate a banana and spinach smoothie for breakfast.",
     "kittens and dogs are boring",
 ]

 # Vectorize the samples to predict on, using the model features as a fixed
 # vocabulary. Nothing is learned from these samples, so transform() is used
 # rather than fit_transform().
 tf_vectorizer = CountVectorizer(vocabulary=features)
 tf = tf_vectorizer.transform(data_samples)

 # transform returns a matrix with one row per document; the columns are the topic weights
 predict = lda.transform(tf)
 print(predict)
	# derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
	# an explanation of the approach is available here: https://www.linkedin.com/pulse/dissociating-training-predicting-latent-dirichlet-lucien-tardres

	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.decomposition import LatentDirichletAllocation
	import pickle

	# Prediction-only script: rebuild an LDA model from parameters saved at
	# training time and print the topic weights for a few new documents.

	# create a blank (unfitted) model; its learned state is filled in below
	lda = LatentDirichletAllocation()

	# Load the vocabulary and the learned LDA parameters from file.
	# SECURITY: pickle.load can execute arbitrary code while deserializing, so
	# 'outfile' must come from a trusted source (e.g. your own training run).
	with open('outfile', 'rb') as fd:
	    # The assignment must be indented under the `with` so the file is
	    # still open while it is being read.
	    (features,
	     lda.components_,
	     lda.exp_dirichlet_component_,
	     lda.doc_topic_prior_) = pickle.load(fd)

	# the dataset to predict on (first two samples were also in the training set so one can compare)
	data_samples = [
	    "I like to eat broccoli and bananas.",
	    "I ate a banana and spinach smoothie for breakfast.",
	    "kittens and dogs are boring",
	]

	# Vectorize the samples to predict on, using the model features as a fixed
	# vocabulary. Nothing is learned from these samples, so transform() is used
	# rather than fit_transform().
	tf_vectorizer = CountVectorizer(vocabulary=features)
	tf = tf_vectorizer.transform(data_samples)

	# transform returns a matrix with one row per document; the columns are the topic weights
	predict = lda.transform(tf)
	print(predict)