view src/de/mpiwg/dwinter/lucencetools/documents/.svn/text-base/OcropusLineDocument.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line source

/*     */ package de.mpiwg.dwinter.lucencetools.documents;
/*     */ 
/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
/*     */ import java.io.File;
/*     */ import java.io.FileNotFoundException;
/*     */ import java.io.PrintStream;
/*     */ import java.io.UnsupportedEncodingException;
/*     */ import org.apache.lucene.document.DateTools;
/*     */ import org.apache.lucene.document.DateTools.Resolution;
/*     */ import org.apache.lucene.document.Document;
/*     */ import org.apache.lucene.document.Field;
/*     */ import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
/*     */ 
/**
 * Static factory that builds one Lucene {@link Document} per OCR line.
 *
 * <p>Each document produced here carries these fields, added in this order:
 * {@code path}, {@code cleanedPath}, {@code pageDimension}, {@code dcMetaData},
 * {@code modified}, {@code textId}, {@code contents}, {@code contentsNormalized},
 * {@code language}, {@code bbox} and {@code lineNumber}.
 */
public class OcropusLineDocument
{
  /**
   * Diagnostics logger. The Dublin-Core metadata trace used to be printed
   * unconditionally to stdout for every line document; it is now emitted at
   * FINE level only, so it is silent unless explicitly enabled.
   */
  private static final java.util.logging.Logger LOG =
      java.util.logging.Logger.getLogger(OcropusLineDocument.class.getName());

  /**
   * Builds a line document without Dublin-Core metadata.
   *
   * <p>Delegates to the seven-argument overload with {@code dcMetaData == null},
   * which that overload stores as an empty string.
   *
   * @param f             file whose path and last-modified time are recorded
   * @param cleanPath     value stored in the {@code cleanedPath} field
   * @param language      value stored in the {@code language} field
   * @param ocrline       OCR line supplying {@code content}, {@code bbox} and {@code lineNumber}
   * @param pageDimension value stored in the {@code pageDimension} field
   * @param textId        value stored in the {@code textId} field
   * @return the populated document
   * @throws FileNotFoundException        declared for caller compatibility; no statement in this class throws it
   * @throws UnsupportedEncodingException declared for caller compatibility; no statement in this class throws it
   */
  public static Document Document(File f, String cleanPath, String language, OCRDocument.OCRLine ocrline, String pageDimension, String textId)
    throws FileNotFoundException, UnsupportedEncodingException
  {
    return Document(f, cleanPath, language, ocrline, pageDimension, null, textId);
  }

  /**
   * Builds a line document, optionally including Dublin-Core metadata.
   *
   * <p>All fields are stored and indexed un-analyzed, except:
   * <ul>
   *   <li>{@code dcMetaData} is stored and analyzed;</li>
   *   <li>{@code contents} and {@code contentsNormalized} are analyzed but not stored.</li>
   * </ul>
   * Both content fields receive the same raw {@code ocrline.content}; this method
   * performs no normalization itself.
   * NOTE(review): presumably a per-field analyzer does the normalization at index time; confirm.
   *
   * <p>Only {@code dcMetaData} is null-guarded. Every other value is handed to
   * {@link Field} as-is.
   * NOTE(review): Lucene's String-valued {@code Field} constructor is expected to
   * reject {@code null} values; confirm for the Lucene version in use.
   *
   * @param f             file whose path and last-modified time are recorded
   * @param cleanPath     value stored in the {@code cleanedPath} field
   * @param language      value stored in the {@code language} field
   * @param ocrline       OCR line supplying {@code content}, {@code bbox} and {@code lineNumber}
   * @param pageDimension value stored in the {@code pageDimension} field
   * @param dcMetaData    metadata text for the {@code dcMetaData} field; {@code null} is stored as {@code ""}
   * @param textId        value stored in the {@code textId} field
   * @return the populated document
   * @throws FileNotFoundException        declared for caller compatibility; no statement in this class throws it
   * @throws UnsupportedEncodingException declared for caller compatibility; no statement in this class throws it
   */
  public static Document Document(File f, String cleanPath, String language, OCRDocument.OCRLine ocrline, String pageDimension, String dcMetaData, String textId)
    throws FileNotFoundException, UnsupportedEncodingException
  {
    String metaData = (dcMetaData == null) ? "" : dcMetaData;

    // Guarded so the message string is only built when FINE logging is enabled.
    if (LOG.isLoggable(java.util.logging.Level.FINE)) {
      LOG.fine("++++++++++++++++++DCMD:" + metaData);
    }

    // Last-modified timestamp of the file, rendered at minute resolution.
    String modified = DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE);

    Document doc = new Document();

    // Field order is kept identical to the previous implementation.
    doc.add(storedKeyword("path", f.getPath()));
    doc.add(storedKeyword("cleanedPath", cleanPath));
    doc.add(storedKeyword("pageDimension", pageDimension));
    doc.add(new Field("dcMetaData", metaData, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(storedKeyword("modified", modified));
    doc.add(storedKeyword("textId", textId));

    // Searchable line text: indexed through the analyzer, not stored.
    doc.add(new Field("contents", ocrline.content, Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field("contentsNormalized", ocrline.content, Field.Store.NO, Field.Index.ANALYZED));

    doc.add(storedKeyword("language", language));
    doc.add(storedKeyword("bbox", ocrline.bbox));
    doc.add(storedKeyword("lineNumber", ocrline.lineNumber));

    return doc;
  }

  /** Creates a field that is stored and indexed as a single un-analyzed term. */
  private static Field storedKeyword(String name, String value)
  {
    return new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
  }
}

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument
 * JD-Core Version:    0.5.4
 */