Tuesday, August 18, 2015

Run a simple Java app on Spark and HDFS

1. Started Hadoop.
2. Created a maven project in IntelliJ.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>edu.berkeley</groupId>
    <artifactId>simple-project</artifactId>
    <name>Simple Project</name>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency> <!-- Spark dependency -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.3.1</version>
        </dependency>
    </dependencies>

</project>

-----------------------------------

/* SimpleApp.java */
import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;

/**
 * Minimal Spark job run against a log file in HDFS. Counts:
 * (1) lines containing the letter 'a', (2) lines containing 'b', and
 * (3) lines whose 8th semicolon-separated field holds a timestamp
 * later than a fixed cutoff.
 */
public class SimpleJava {

    /**
     * Cutoff timestamp. Plain string comparison is sufficient because the
     * "yyyy-MM-dd HH:mm:ss" format sorts lexicographically.
     */
    private static final String TIME_CUTOFF = "2015-06-29 23:59:59";

    public static void main(String[] args) {
        // Should be some file on your system (an HDFS path in this setup).
        String logFile = "/user/XXXXXXXX/input/a.log";
        // NOTE: in the original post this statement had been fused into the
        // comment above, leaving `conf` undeclared — restored here.
        SparkConf conf = new SparkConf().setAppName("Simple Application");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // cache(): the RDD is reused by three separate filter/count actions.
        JavaRDD<String> logData = sc.textFile(logFile).cache();

        long numAs = logData.filter(new Function<String, Boolean>() {
            public Boolean call(String s) { return s.contains("a"); }
        }).count();

        long numBs = logData.filter(new Function<String, Boolean>() {
            public Boolean call(String s) { return s.contains("b"); }
        }).count();

        System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);

        // Count lines whose 8th ';'-separated field (index 7) is a non-"null"
        // timestamp string later than TIME_CUTOFF.
        long numByField = logData.filter(new Function<String, Boolean>() {
            public Boolean call(String s) {
                String[] token = s.split(";");
                // Guard against short lines, then read the field directly —
                // no need to loop over every token just to reach index 7.
                if (token.length <= 7) {
                    return false;
                }
                String timeInHdfs = token[7]; // e.g. 2015-06-30 14:00:29.0
                System.out.println(timeInHdfs);
                return !timeInHdfs.equalsIgnoreCase("null")
                        && timeInHdfs.compareTo(TIME_CUTOFF) > 0;
            }
        }).count();

        System.out.println("-----------------------------------------------------------------------");
        System.out.println("Lines with bigger time: numByField: " + numByField);

        // Release executors and shut down the context cleanly.
        sc.stop();
    }
}

-----------------------------------
Notes: when creating the JAR artifact in IntelliJ, select "link by META-INF" rather than "build in".

Run:
$ ./bin/spark-submit --class "SimpleJava" --master local[4] ~/work/dev/bigdata/SimpleJava/out/artifacts/SimpleJava_jar/SimpleJava.jar
Check in web ui:
$ ./sbin/start-all.sh
visit http://localhost:8080

No comments:

Post a Comment