Created
May 27, 2015 17:11
-
-
Save tomelm/a019b1fe8317f512158b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clf = an sklearn.linear_model.LogisticRegression instance
# significant_terms = set of terms appearing more than n times in training
def classify(left_name, right_name):
    """
    Classifies names using delta term analysis.

    Depends on names defined outside this function: ``clf``,
    ``significant_terms``, ``get_name_delta_terms``, ``feature_vectorize``,
    ``ExactMatchTerm`` and ``OneSideTerm``.

    :param left_name: first name of the pair; passed unchanged to
        ``get_name_delta_terms``.
    :param right_name: second name of the pair; passed unchanged to
        ``get_name_delta_terms``.
    :return:
        A tuple (p_is_duplicate, exact_match_rare_terms, one_side_rare_terms).
        * p_is_duplicate is the score from the log-linear classifier. It's
          probably the most relevant signal.
        * exact_match_rare_terms is the number of terms that are RARE and
          exact matches. This is generally a positive signal.
        * one_side_rare_terms is the number of terms that are RARE and either
          only occur on one side, or are partial matches. This is generally
          a negative signal.
    :rtype: (float, int, int)
    """
    # Calls a word aligner on the two names. Returns a list of
    # ExactMatchTerm(string)/OneSideTerm(string) objects.
    delta_terms = get_name_delta_terms(left_name, right_name)

    # Only terms known to `significant_terms` are fed to the classifier and
    # counted below.
    significant_delta_terms = [
        term for term in delta_terms if term.word in significant_terms
    ]

    # The strict nested unpacking requires exactly one row with exactly two
    # probabilities; anything else raises instead of being silently misread.
    # The first column is not used by this function.
    ((_, p_true),) = clf.predict_proba(
        feature_vectorize(significant_delta_terms)
    )

    # Signals outside of the classifier prediction. Both are counts.
    exact_match_rare_terms = sum(
        int(term.is_rare() and isinstance(term, ExactMatchTerm))
        for term in significant_delta_terms
    )
    one_side_rare_terms = sum(
        int(term.is_rare() and isinstance(term, OneSideTerm))
        for term in significant_delta_terms
    )

    return p_true, exact_match_rare_terms, one_side_rare_terms
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment