Created
March 4, 2015 19:37
-
-
Save sunilmallya/ba302f594011938e44ad to your computer and use it in GitHub Desktop.
Compute cosine score
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import stemmer | |
def irange(sequence):
    """Return a list of ``(index, item)`` pairs for *sequence*.

    Indices start at 0. Any iterable is accepted (not only objects that
    support ``len()``), and a real list is returned on both Python 2 and
    Python 3 so the result can be iterated more than once or indexed.
    """
    return list(enumerate(sequence))
class CosineScore(object):
    """Score documents against a query with a tf / idf cosine-style measure.

    The corpus is indexed once at construction time into ``posting_list``,
    a mapping ``term -> {doc_id: term frequency}`` where ``doc_id`` is the
    position of the document in the list passed to the constructor. Term
    positions are not recorded.
    """

    # String type(s) used to tell "raw text" from "already tokenized" input.
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _STRING_TYPES = (basestring,)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, all_docs, pstemmer=None):
        """Index *all_docs*.

        all_docs -- list of documents ``[doc1, doc2, ...]``; each document is
                    a string (split on whitespace) or an iterable of tokens.
        pstemmer -- optional object exposing ``stem(word)``. When omitted,
                    ``stemmer.PorterStemmer()`` is used, as before.
        """
        self.documents = all_docs
        self.ndocs = len(all_docs)
        self.posting_list = {}  # term => {doc_id => freq}
        if pstemmer is None:
            pstemmer = stemmer.PorterStemmer()
        self.pstemmer = pstemmer
        self._term_indexer()

    def _term_indexer(self):
        """Fill ``posting_list`` with the stemmed term counts of every document."""
        for doc_id, document in enumerate(self.documents):
            for word in self._listToString(document):
                s_word = self.pstemmer.stem(word)
                doc_id_mapping = self.posting_list.setdefault(s_word, {})
                doc_id_mapping[doc_id] = doc_id_mapping.get(doc_id, 0) + 1

    def _term_frequency(self, term):
        """Return the ``{doc_id: freq}`` mapping for *term*, or -1 if unknown."""
        return self.posting_list.get(term, -1)

    def _listToString(self, arg):
        """Return *arg* as a list of tokens.

        Despite its historical name, this converts a string into a list.
        Strings are split on runs of whitespace, so repeated spaces or an
        empty string never produce empty-string tokens. Any other iterable
        is assumed to be tokenized already and is copied into a list.
        """
        if isinstance(arg, self._STRING_TYPES):
            return arg.split()
        return list(arg)

    def __qTermFrequency(self, term, bWords):
        """Return how many entries of *bWords* are equal to *term*."""
        return sum(1 for word in bWords if word == term)

    def _docListWeights(self):
        """Return, per document, the Euclidean norm of its term-frequency vector.

        The result is a list of length ``ndocs`` indexed by ``doc_id``; a
        document with no indexed terms gets 0.0.
        """
        squared_sums = [0.0] * self.ndocs
        for postings in self.posting_list.values():
            for doc_id, tf in postings.items():
                squared_sums[doc_id] += tf * tf
        return [math.sqrt(total) for total in squared_sums]

    def compute(self, query, mIDF=0):
        """Return one score per document for *query*.

        query -- query text (split on whitespace) or an iterable of tokens;
                 each token is stemmed before lookup. A token repeated in the
                 query contributes once per occurrence.
        mIDF  -- when non-zero, used in place of each term's document
                 frequency in the idf computation; when 0 (default) the
                 number of documents containing the term is used.

        For every query term found in the corpus:
            idf   = log10(ndocs / document frequency)
            score[doc] += idf * tf(term, doc) / norm(doc)
        where ``norm(doc)`` comes from ``_docListWeights``. Query terms that
        do not occur in the corpus are skipped: they cannot match any
        document, and their document frequency of 0 must not reach the
        division in the idf formula.
        """
        scores = [0.0] * self.ndocs
        normalizationFactor = self._docListWeights()
        for qterm in self._listToString(query):
            term = self.pstemmer.stem(qterm)
            postings = self.posting_list.get(term)
            if not postings:
                continue
            dft = mIDF if mIDF != 0 else len(postings)
            wTQ = math.log10(float(self.ndocs) / float(dft))  # idf
            for doc_id, tf in postings.items():
                norm = normalizationFactor[doc_id]
                # A zero norm cannot occur for a document that has a posting,
                # but keep the guard so a hand-edited posting_list cannot
                # trigger a division by zero.
                wFTD = tf / float(norm) if norm != 0 else 0.0
                scores[doc_id] += wTQ * wFTD
        return scores
if __name__ == "__main__":
    # Small demo: score a three-document corpus against a two-word query and
    # print the per-document scores (list index == document index).
    docs = ["mallya", "mallya mallya in hawaii", "sunil"]
    q = "hawaii mallya"
    cs = CosineScore(docs)
    # Call form of print works on both Python 2 and Python 3.
    print(cs.compute(q))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
import stemmer
ModuleNotFoundError: No module named 'stemmer'
Help needed, please: how do I resolve this missing `stemmer` module error?