-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathREADME.txt
executable file
·27 lines (19 loc) · 1.28 KB
/
README.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
bin/spark-shell --master local[4] --executor-memory 3g --driver-memory 3g --driver-class-path "/Users/raphael/readr-spark/spark-netlib/target/scala-2.10/spark-netlib-assembly-1.0-SNAPSHOT.jar" --jars "/Users/raphael/readr-spark/spark-readr/target/scala-2.10/spark-readr-assembly-1.0-SNAPSHOT.jar,/Users/raphael/readr-spark/spark-distsim/target/scala-2.10/spark-distsim-assembly-1.0-SNAPSHOT.jar" --driver-java-options "-Dspark.serializer=org.apache.spark.serializer.KryoSerializer -Dspark.kryo.registrator=com.readr.spark.MyRegistrator -Dspark.kryoserializer.buffer.mb=512 -Dcom.github.fommil.netlib.BLAS=com.github.fommil.netlib.NativeRefBLAS Dspark.executor.memory=4G"
val sourceDir = "/Users/raphael/data/source/barrons-4th-grade"
val outDir = "/Users/raphael/data/processed/barrons-4th-grade"
implicit val isc = sc
import com.readr.spark._
import com.readr.spark.rr._
import com.readr.spark.index._
import com.readr.spark.allenai._
import com.readr.spark.stanford34._
import com.readr.spark.other._
import com.readr.spark.cj._
import com.readr.spark.frame._
import com.readr.spark.distsim._
implicit val se = new Schema
val a = read(sourceDir, se).repartition(8)
val b = annotate(a, new FactorieSegmenter, se)
val c = annotate(b, new FactorieTokenizer, se)
c.persist()
LSAOnSentences.run(outDir, c)