/**
 * Builds a record reader for one multi-file split.
 *
 * <p>Keeps references to the split and the configuration, caches the array
 * of paths reported by the split, and creates the {@code TikaHelper} from
 * the same configuration.
 *
 * @param conf  configuration kept for later use and handed to the helper
 * @param split split whose paths this reader will work through
 * @throws IOException declared by the signature; propagated from the calls made here
 */
public TikaJaqlRecordReader(Configuration conf, MultiFileSplit split) throws IOException {
    this.conf = conf;
    this.split = split;

    // Fetch the path list once up front rather than asking the split each time.
    paths = split.getPaths();
    tikaHelper = new TikaHelper(conf);
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | public boolean next(JsonHolder key, JsonHolder value) throws IOException { if (count >= split.getNumPaths()) { done = true; return false; } Path file = paths[count]; fs = file.getFileSystem(conf); InputStream stream = fs.open(file); BufferedJsonRecord bjr = new BufferedJsonRecord(); bjr.setNotSorted(); bjr.add(new JsonString("path"), new JsonString(file.getName())); bjr.add(new JsonString("content"), new JsonString(this.tikaHelper.readPath(stream))); value.setValue(bjr); stream.close(); count++; return true; } |
// Add the Tika application jar and the TikaJaql jar to the class path.
addRelativeClassPath(getSystemSearchPath(), '/home/biadmin/tika-app-1.5.jar,/home/biadmin/TikaJaql.jar');

// Define tikaRead: builds the input descriptor passed to read().
// The descriptor carries the given location plus a fixed set of input
// options selecting the Hadoop input adapter, the TikaJaqlInputFormat and
// the file input configurator.
// NOTE(review): the inoptions/outoptions arguments are accepted but not
// referenced in the body - confirm whether they were meant to be merged in.
tikaRead = fn (
    location   : string,
    inoptions  : {*}? = null,
    outoptions : {*}? = null
)
{
    location,
    "inoptions": {
        "adapter": "com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter",
        "format": "com.ibm.imte.tika.jaql.TikaJaqlInputFormat",
        "configurator": "com.ibm.jaql.io.hadoop.FileInputConfigurator"
    }
};
1 2 3 4 | review1.doc review2.doc review3.doc ... |
// Load the module that defines tikaRead.
import tika(*);

// Read every document under /tmp/reviews through tikaRead and write the
// resulting path/content records to /tmp/output as quoted, "|"-delimited text.
read(tikaRead("/tmp/reviews"))
// You could put data transformations here
-> write(del("/tmp/output", {schema:schema{path,content}, delimiter:"|", quoted:true}));
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |