Created
January 12, 2018 10:25
-
-
Save bobbruno/69dfa9e47cd755f4613722586e96346c to your computer and use it in GitHub Desktop.
Stanford CoreNLP in Spark Scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
  * Allows usage of Stanford CoreNLP in Spark by lazily creating one pipeline
  * instance per JVM (worker), so annotation can run in parallel without
  * attempting to serialize the pipeline itself.
  *
  * @param annotators comma-separated CoreNLP annotator pipeline (e.g. "tokenize, ssplit")
  * @param params     additional (key, value) properties passed to the annotators
  */
class NLPPipeline(annotators: String, params: (String, String)*) extends Serializable {
  import edu.stanford.nlp.pipeline._
  import java.util.Properties

  // @transient: the pipeline is not serializable and is rebuilt on each worker.
  @transient private var nlpPipeline: StanfordCoreNLP = _

  /**
    * Returns the CoreNLP pipeline local to this JVM, building it on first use
    * from the constructor parameters.
    *
    * Synchronized because an executor may run tasks on several threads; an
    * unguarded null-check could construct multiple (expensive) pipelines.
    */
  private def getOrCreatePipeline(): StanfordCoreNLP = synchronized {
    if (nlpPipeline == null) {
      val props = new Properties()
      props.setProperty("annotators", annotators)
      // foreach, not map: we only want the side effect of setting each property.
      params.foreach { case (key, value) => props.setProperty(key, value) }
      nlpPipeline = new StanfordCoreNLP(props)
    }
    nlpPipeline
  }

  /**
    * Basic step of the pipeline, transforming any text into a CoreNLP document.
    *
    * @param keyword the text to be annotated
    * @return the annotated CoreNLP document
    */
  def transform(keyword: String) = {
    val pipeline = getOrCreatePipeline()
    pipeline.process(keyword)
  }
}
/**
  * Example object implementing a lemmatization pipeline.
  */
object Lemma extends NLPPipeline("tokenize, ssplit, pos, lemma") {
  import edu.stanford.nlp.ling.CoreAnnotations._
  import org.apache.spark.sql.functions.udf
  // JavaConverters (explicit .asScala) instead of the deprecated implicit JavaConversions.
  import scala.collection.JavaConverters._

  /**
    * Helper class giving a nice structure to the results in a DataFrame.
    *
    * @param tokens the original tokens of the text
    * @param lemmas the lemma of each token, positionally aligned with `tokens`
    */
  case class Lemmas(tokens: Seq[String], lemmas: Seq[String])

  /**
    * UDF that runs the pipeline on a DataFrame string column and returns,
    * for each value, its tokens and their lemmas as a [[Lemmas]] struct.
    */
  def lemmatize = udf((keyword: String) => {
    val doc = transform(keyword)
    val tokens = doc
      .get(classOf[SentencesAnnotation]).asScala
      .flatMap(_.get(classOf[TokensAnnotation]).asScala)
    Lemmas(tokens.map(_.get(classOf[TextAnnotation])), tokens.map(_.get(classOf[LemmaAnnotation])))
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment