<file1.pdf>|<content of file1>
<file2.pdf>|<content of file2>
<file3.pdf>|<content of file3>
...
/**
 * Creates a helper with a new {@code Tika} instance and applies optional
 * overrides taken from the job configuration.
 *
 * <p>Two keys are read: {@code com.ibm.imte.tika.delimiter} and
 * {@code com.ibm.imte.tika.replaceCharacterWith}. A key that is absent
 * leaves the corresponding field at whatever value it already had.
 *
 * @param conf configuration the two keys are looked up in
 */
public TikaHelper(Configuration conf) {
    tika = new Tika();

    // The configured delimiter is stored wrapped in square brackets.
    String delimiterOverride = conf.get("com.ibm.imte.tika.delimiter");
    if (delimiterOverride != null) {
        this.delimiter = "[" + delimiterOverride + "]";
    }

    String replacementOverride = conf.get("com.ibm.imte.tika.replaceCharacterWith");
    if (replacementOverride != null) {
        this.replaceWith = replacementOverride;
    }

    // Report the effective settings, whether overridden or not.
    logger.info("Delimiter: " + delimiter);
    logger.info("Replace With character:" + replaceWith);
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | public String readPath(InputStream stream) { try { String content = tika.parseToString(stream); content = content.replaceAll(delimiter, replaceWith); content = content.replaceAll(endLine, replaceWith); return content; } catch (Exception e) { logger.error("Malformed PDF for Tika: " + e.getMessage()); } return "Malformed PDF"; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | public class TikaInputFormat extends CombineFileInputFormat<Text, Text> { @Override public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { return new TikaRecordReader((CombineFileSplit) split, context); } @Override protected boolean isSplitable(JobContext context, Path file) { return false; } } |
/**
 * Prepares a reader over the files of a combined split.
 *
 * @param split   split whose paths will be read
 * @param context task attempt context; its configuration is used both to
 *                obtain the file system and to build the {@code TikaHelper}
 * @throws IOException if the file system cannot be obtained
 */
public TikaRecordReader(CombineFileSplit split, TaskAttemptContext context)
        throws IOException {
    // Keep the split itself as well as its path array.
    this.split = split;
    this.paths = split.getPaths();

    this.fs = FileSystem.get(context.getConfiguration());
    this.tikaHelper = new TikaHelper(context.getConfiguration());
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | @Override public boolean nextKeyValue() throws IOException, InterruptedException { if (count >= split.getNumPaths()) { done = true; return false; //we have no more data to parse } Path path = null; key = new Text(); value = new Text(); try { path = this.paths[count]; } catch (Exception e) { return false; } currentStream = null; currentStream = fs.open(path); key.set(path.getName()); value.set(tikaHelper.readPath(currentStream)); currentStream.close(); count++; return true; //we have more data to parse } |
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |