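/*
 * Repository view: src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java @ 0:dc7622afcfea (default, tip)
 *
 * Commit message: initial
 * Author:         dwinter
 * Date:           Wed, 03 Nov 2010 12:33:16 +0100
 * Parents:        (none)
 * Children:       (none)
 *
 * Viewer settings at export time: line wrap: on, line source.
 */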

/*     */ package de.mpiwg.dwinter.lucencetools.documents;
/*     */ 
/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.XMLFilteredReader;
/*     */ import java.io.File;
/*     */ import java.io.FileInputStream;
/*     */ import java.io.IOException;
/*     */ import java.io.Reader;
/*     */ import org.apache.lucene.document.DateTools;
/*     */ import org.apache.lucene.document.DateTools.Resolution;
/*     */ import org.apache.lucene.document.Document;
/*     */ import org.apache.lucene.document.Field;
/*     */ import org.apache.lucene.document.Field.Index;
/*     */ import org.apache.lucene.document.Field.Store;
/*     */ 
/**
 * Static helpers that build a Lucene {@link Document} for a file on disk and
 * render selected stored fields of such a document as an XML fragment.
 */
public class FileDocument
{
  /**
   * Renders the {@code cleanedPath}, {@code textId} and {@code dcMetaData}
   * fields of a document as a {@code <result>} XML fragment.
   *
   * <p>The text id is emitted twice: once with every {@code /} replaced by
   * {@code :} ({@code <textId>}) and once with every {@code /} replaced by
   * {@code _} ({@code <textIdCleaned>}).
   *
   * <p>A field that is absent from the document is rendered as an empty
   * element instead of failing or printing the literal text {@code null}.
   *
   * <p>NOTE(review): values are inserted verbatim, without XML escaping.
   * {@code dcMetaData} may itself be an XML fragment that consumers expect to
   * be embedded raw, so escaping is intentionally not applied here - confirm
   * with the consumers before changing it.
   *
   * @param doc the document to read the stored field values from
   * @return the XML fragment as a string
   */
  public static String toXML(Document doc)
  {
    String cleanedPath = valueOrEmpty(doc.get("cleanedPath"));
    String textId = valueOrEmpty(doc.get("textId"));
    String md = valueOrEmpty(doc.get("dcMetaData"));

    StringBuilder xml = new StringBuilder("<result>");
    xml.append("<cleanedPath>").append(cleanedPath).append("</cleanedPath>");
    xml.append("<textId>").append(textId.replace("/", ":")).append("</textId>");
    xml.append("<textIdCleaned>").append(textId.replace("/", "_")).append("</textIdCleaned>");
    xml.append("<md>").append(md).append("</md>");
    xml.append("</result>");
    return xml.toString();
  }

  /**
   * Builds a document for {@code f} without Dublin Core metadata; delegates to
   * {@link #Document(File, String, String, String, String)} with a
   * {@code null} {@code dcMetaData}.
   *
   * @param f           the file to describe
   * @param cleanedPath value stored in the {@code cleanedPath} field
   * @param language    value stored in the {@code language} field
   * @param textId      value stored in the {@code textId} field; {@code null} is stored as ""
   * @return the populated document
   * @throws IOException if the file's canonical path cannot be resolved or the file cannot be opened
   */
  public static Document Document(File f, String cleanedPath, String language, String textId)
    throws IOException
  {
    return Document(f, cleanedPath, language, null, textId);
  }

  /**
   * Builds a document for {@code f} with these fields, in this order:
   * {@code path} (canonical path), {@code cleanedPath}, {@code dcMetaData},
   * {@code textId}, {@code modified} (last-modified time at minute
   * resolution), {@code contents} (a reader over the file) and
   * {@code language}. All fields except {@code contents} are stored;
   * {@code dcMetaData} is the only stored field that is analyzed.
   *
   * <p>On success the reader behind {@code contents} is left open: the
   * document only holds a reference to it, and it is presumably consumed when
   * the document is added to an index, so the caller must index the document
   * before the stream can be released. If building the document fails after
   * the file was opened, the file stream is closed here so it does not leak.
   *
   * <p>NOTE(review): {@code cleanedPath} and {@code language} are passed to
   * {@code Field} unchanged; unlike {@code dcMetaData} and {@code textId} no
   * default is substituted, so they are presumably required to be non-null -
   * confirm against the Lucene version in use.
   *
   * @param f           the file to describe and read
   * @param cleanedPath value stored in the {@code cleanedPath} field
   * @param language    value stored in the {@code language} field
   * @param dcMetaData  value stored in the {@code dcMetaData} field; {@code null} is stored as ""
   * @param textId      value stored in the {@code textId} field; {@code null} is stored as ""
   * @return the populated document
   * @throws IOException if the file's canonical path cannot be resolved or the file cannot be opened
   */
  public static Document Document(File f, String cleanedPath, String language, String dcMetaData, String textId)
    throws IOException
  {
    Document doc = new Document();

    doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("cleanedPath", cleanedPath, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("dcMetaData", valueOrEmpty(dcMetaData), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("textId", valueOrEmpty(textId), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(
      new Field("modified",
        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
        Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Open the stream separately so it can be closed if anything below fails.
    FileInputStream stream = new FileInputStream(f);
    boolean handedOff = false;
    try {
      Reader in = new XMLFilteredReader(stream, "UTF-8");
      doc.add(new Field("contents", in));
      doc.add(new Field("language", language, Field.Store.YES, Field.Index.NOT_ANALYZED));
      handedOff = true;
      return doc;
    } finally {
      if (!handedOff) {
        try {
          stream.close();
        } catch (IOException ignored) {
          // Best-effort cleanup: let the original failure propagate.
        }
      }
    }
  }

  /** Returns {@code value}, or the empty string when {@code value} is {@code null}. */
  private static String valueOrEmpty(String value)
  {
    return value == null ? "" : value;
  }
}

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.lucencetools.documents.FileDocument
 * JD-Core Version:    0.5.4
 */