Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/lucencetools/documents/.svn/text-base/OcropusLineDocument.java.svn-base @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/lucencetools/documents/.svn/text-base/OcropusLineDocument.java.svn-base Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,59 @@ +/* */ package de.mpiwg.dwinter.lucencetools.documents; +/* */ +/* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument; +import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine; +/* */ import java.io.File; +/* */ import java.io.FileNotFoundException; +/* */ import java.io.PrintStream; +/* */ import java.io.UnsupportedEncodingException; +/* */ import org.apache.lucene.document.DateTools; +/* */ import org.apache.lucene.document.DateTools.Resolution; +/* */ import org.apache.lucene.document.Document; +/* */ import org.apache.lucene.document.Field; +/* */ import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +/* */ +/* */ public class OcropusLineDocument +/* */ { +/* */ public static Document Document(File f, String cleanPath,String language, OCRDocument.OCRLine ocrline, String pageDimension, String textId) +/* */ throws FileNotFoundException, UnsupportedEncodingException +/* */ { +/* 65 */ return Document(f, cleanPath,language, ocrline, pageDimension, null, textId); +/* */ } +/* */ +/* */ public static Document Document(File f, String cleanPath,String language, OCRDocument.OCRLine ocrline, String pageDimension, String dcMetaData, String textId) +/* */ throws FileNotFoundException, UnsupportedEncodingException +/* */ { +/* 72 */ Document doc = new Document(); +/* */ +/* 76 */ doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(new Field("cleanedPath", cleanPath, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* 77 */ doc.add(new Field("pageDimension", pageDimension, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* 78 */ if (dcMetaData == null) +/* 79 */ dcMetaData = ""; +/* 80 */ System.out.println("++++++++++++++++++DCMD:" + dcMetaData); +/* 81 */ doc.add(new Field("dcMetaData", dcMetaData, Field.Store.YES, Field.Index.ANALYZED)); +/* */ +/* 90 */ doc.add( +/* 92 */ new Field("modified", +/* 91 */ DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), +/* 92 */ Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 94 */ doc.add(new Field("textId", textId, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 106 */ doc.add(new Field("contents", ocrline.content, Field.Store.NO, Field.Index.ANALYZED)); +/* 107 */ doc.add(new Field("contentsNormalized", ocrline.content, Field.Store.NO, Field.Index.ANALYZED)); +/* */ +/* 111 */ doc.add(new Field("language", language, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 114 */ doc.add(new Field("bbox", ocrline.bbox, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 116 */ doc.add(new Field("lineNumber", ocrline.lineNumber, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* 117 */ return doc; +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument + * JD-Core Version: 0.5.4 + */ \ No newline at end of file