diff src/de/mpiwg/dwinter/lucencetools/documents/OcropusLineDocument.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/lucencetools/documents/OcropusLineDocument.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,59 @@
+/*     */ package de.mpiwg.dwinter.lucencetools.documents;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
+import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
+/*     */ import java.io.File;
+/*     */ import java.io.FileNotFoundException;
+/*     */ import java.io.PrintStream;
+/*     */ import java.io.UnsupportedEncodingException;
+/*     */ import org.apache.lucene.document.DateTools;
+/*     */ import org.apache.lucene.document.DateTools.Resolution;
+/*     */ import org.apache.lucene.document.Document;
+/*     */ import org.apache.lucene.document.Field;
+/*     */ import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+/*     */ 
+/** Builds the Lucene {@link Document} that indexes one OCR line ({@code OCRDocument.OCRLine}). */
+public class OcropusLineDocument
+{
+  /** Same as the seven-argument overload, passing {@code null} as {@code dcMetaData}. */
+  public static Document Document(File f, String cleanPath,String language, OCRDocument.OCRLine ocrline, String pageDimension, String textId)
+    throws FileNotFoundException, UnsupportedEncodingException
+  {
+    return Document(f, cleanPath, language, ocrline, pageDimension, null, textId);
+  }
+
+  /**
+   * Creates a document with stored, un-analyzed fields path, cleanedPath, pageDimension,
+   * modified, textId, language, bbox and lineNumber, a stored, analyzed dcMetaData field,
+   * and the line text as analyzed, unstored fields contents and contentsNormalized.
+   * A {@code null} dcMetaData is indexed as the empty string.
+   */
+  public static Document Document(File f, String cleanPath,String language, OCRDocument.OCRLine ocrline, String pageDimension, String dcMetaData, String textId)
+    throws FileNotFoundException, UnsupportedEncodingException
+  {
+    final String metaData = (dcMetaData == null) ? "" : dcMetaData;
+    final Document lineDoc = new Document();
+    lineDoc.add(keyword("path", f.getPath()));
+    lineDoc.add(keyword("cleanedPath", cleanPath));
+    lineDoc.add(keyword("pageDimension", pageDimension));
+    System.out.println("++++++++++++++++++DCMD:" + metaData); // debug trace on stdout, kept as-is
+    lineDoc.add(new Field("dcMetaData", metaData, Field.Store.YES, Field.Index.ANALYZED));
+    // The file's last-modified timestamp is stored at minute resolution.
+    lineDoc.add(keyword("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE)));
+    lineDoc.add(keyword("textId", textId));
+    lineDoc.add(new Field("contents", ocrline.content, Field.Store.NO, Field.Index.ANALYZED));
+    lineDoc.add(new Field("contentsNormalized", ocrline.content, Field.Store.NO, Field.Index.ANALYZED));
+    lineDoc.add(keyword("language", language));
+    lineDoc.add(keyword("bbox", ocrline.bbox));
+    lineDoc.add(keyword("lineNumber", ocrline.lineNumber));
+    return lineDoc;
+  }
+  /** Wraps a name/value pair in a stored field that is indexed without analysis. */
+  private static Field keyword(String name, String value) { return new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED); }
+}
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file