The EclairJS Nashorn API exposes the Spark programming model to JavaScript. EclairJS Nashorn is built on top of Spark's Java API. For a NodeJS implementation of the Spark programming model visit the eclairjs-node project.
Prerequisites
git clone git@github.com:EclairJS/eclairjs-nashorn.git
mvn package
export SPARK_HOME=<location of Spark binary distribution>
bin/eclairjs.sh examples/word_count.js
or
bin/eclairjs.sh
eclairJS>var list = sc.parallelize([1,10,20,30,40]);
list.count();
var SparkConf = require('eclairjs/SparkConf');
var SparkContext = require('eclairjs/SparkContext');
var file = "src/test/resources/dream.txt"; // Should be some file on your system
var conf = new SparkConf().setAppName("JavaScript word count")
.setMaster("local[*]");
var sparkContext = new SparkContext(conf);
var rdd = sparkContext.textFile(file).cache();
var rdd2 = rdd.flatMap(function(sentence) {
return sentence.split(" ");
});
var rdd3 = rdd2.filter(function(word) {
return word.trim().length > 0;
});
var rdd4 = rdd3.mapToPair(function(word) {
return [word, 1];
});
var rdd5 = rdd4.reduceByKey(function(a, b) {
return a + b;
});
var rdd6 = rdd5.mapToPair(function(tuple) {
return [tuple[1]+0.0, tuple[0]];
})
var rdd7 = rdd6.sortByKey(false);
print("top 10 words = " + rdd7.take(10));
Prerequisites
- Jupyter
- Apache Toree: Toree needs to be built for Apache Spark 1.6.0. Set the environment variable
export APACHE_SPARK_VERSION=1.6.0
before building Toree with make dist
and then publish to the local Maven repository with sbt publishM2
- Note: When running
sbt publishM2
you may see an error such as [error] (toree/compile:packageBin) java.util.zip.ZipException: duplicate entry: LICENSE
at the very end. This doesn't really affect anything, and you should be able to proceed with the following instructions.
Instructions:
-
Edit pom.xml under the eclairjs-nashorn project root and change the reference to
toree-assembly-<version>-incubating-SNAPSHOT.jar
to point to the one under your incubator-toree project, e.g. <path to incubator-toree distribution>/toree/lib/toree-assembly-<version>-incubating-SNAPSHOT.jar
-
mvn package -Pnotebook
-
Edit kernel.json and update the following:
<path to incubator-toree distribution>/toree/bin/run.sh
"SPARK_OPTS": --jars file:<path to nashorn jar>
"SPARK_HOME": <path to spark 1.6.0 distribution>
- Copy kernel.json to
~/.ipython/kernels/eclair/kernel.json
- Gateway 4.0.0 and higher uses
~/Library/Jupyter/kernels/eclair
-
Create a directory for your notebook
mkdir ~/jsNotebook
-
Change to that directory
cd ~/jsNotebook
-
Start jupyter
ipython notebook
-
A browser will open
http://localhost:8888/tree
Select New -> Spark 1.6.0 (EclairJS)
-
Enter the following code in notebook cell and run
var SparkContext = require('eclairjs/SparkContext');
var sc = new SparkContext("local[*]", "myapp");
var rdd = sc.parallelize([10, 4, 2, 12, 3]);
eval("count = " + rdd.count());
It should be noted that the master branch is used for development and, although every effort is made to keep it stable, it could be in a slight state of flux depending on the work in progress. Please see our releases page if you would like to download a stable version.
More detailed information is available in the Eclair Nashorn Wiki. Presentations and information about how to get involved are in Project and Community.