This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Makes sure the Stanford CoreNLP models jar is registered with the SparkContext `sc`:
// when no jar listed by `sc` contains the model name, the jar is downloaded to /tmp
// and added via `sc.addJar`.
val version = "3.7.0" // CoreNLP version the model will be used with
val model = s"stanford-corenlp-$version-models" // append "-english" to use the full English model
if (!sc.listJars.exists(_.contains(model))) {
  import scala.sys.process._
  // Maven Central no longer serves plain-HTTP requests, so the artifact is fetched over HTTPS.
  val url = s"https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/$version/$model.jar"
  val target = s"/tmp/$model.jar"
  // The command is passed as a Seq so each argument reaches wget intact instead of
  // depending on whitespace splitting of a single command string.
  // `!!` throws if wget exits with a non-zero status, so `addJar` is skipped on a failed download.
  Seq("wget", url, "-O", target).!!
  sc.addJar(target)
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
This class allows usage of CoreNLP in Spark, creating an instance of the pipeline on each worker so that the | |
code can run in parallel. | |
@param annotators: the CoreNLP annotator pipeline | |
@param params: the parameters desired for the annotators | |
*/ | |
class NLPPipeline(annotators: String, params: Tuple2[String, String]*) extends Serializable { | |
import edu.stanford.nlp.pipeline._ | |
import java.util.Properties |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def bufferise(defbuf=20, defskip=0): | |
def decorate(function): | |
def wrapper(*args, **kwargs): | |
bufsize = kwargs['bufsize'] if 'bufsize' in kwargs else defbuf | |
skiplines = kwargs['skiplines'] if 'skiplines' in kwargs else defskip | |
print 'Bufsize = {}'.format(bufsize) | |
print 'Skip {} lines'.format(skiplines) | |
if skiplines: | |
for i, record in enumerate(function(*args, **kwargs), start=1): | |
if i > skiplines: |