Created
September 28, 2015 10:55
-
-
Save feupeu/9c524745467ad7c93bc0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Index the words of the page body. Nothing is indexed when doc.body() is null.
if (doc.body() != null) {
    // Split the body text into tokens. StringTokenizer's default delimiters are
    // whitespace only (space, tab, newline, carriage return, form feed), so any
    // punctuation stays attached to the token it touches.
    final StringTokenizer tokenizer = new StringTokenizer(doc.body().text());

    // Collect the stemmed form of every token that is not a stop word.
    // NOTE(review): the stop-word lookup uses the raw token, so it is
    // case-sensitive and punctuation-sensitive as written -- confirm that
    // Constants.stopWords and Porter.stem expect un-normalized input.
    final ArrayList<String> tokens = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        final String token = tokenizer.nextToken();
        if (!Constants.stopWords.contains(token)) {
            tokens.add(Porter.stem(token));
        }
    }

    // Add the terms to the index only after the whole page has been tokenized,
    // associating each one with url.
    for (final String term : tokens) {
        Index.addTerm(term, url);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment