diff src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,73 @@
+/*     */ package de.mpiwg.dwinter.lucencetools.documents;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.XMLFilteredReader;
+/*     */ import java.io.File;
+/*     */ import java.io.FileInputStream;
+/*     */ import java.io.IOException;
+/*     */ import java.io.Reader;
+/*     */ import org.apache.lucene.document.DateTools;
+/*     */ import org.apache.lucene.document.DateTools.Resolution;
+/*     */ import org.apache.lucene.document.Document;
+/*     */ import org.apache.lucene.document.Field;
+/*     */ import org.apache.lucene.document.Field.Index;
+/*     */ import org.apache.lucene.document.Field.Store;
+/*     */ 
+/*     */ public class FileDocument
+/*     */ {
+			
+			public static String toXML(Document doc){
+				//String path = doc.get("path");
+				String cleanedPath = doc.get("cleanedPath");
+				String textId = doc.get("textId");
+				String md = doc.get("dcMetaData");
+				String ret = "<result>";
+				ret+= "<cleanedPath>"+cleanedPath+"</cleanedPath>";
+				ret+= "<textId>"+textId.replace("/",":")+"</textId>";
+				ret+= "<textIdCleaned>"+textId.replace("/","_")+"</textIdCleaned>";
+				ret+= "<md>"+md+"</md>";
+				ret+="</result>";
+				return ret;
+				
+			}
+/*     */   public static Document Document(File f, String cleanedPath,String language, String textId)
+/*     */     throws IOException
+/*     */   {
+/*  63 */     return Document(f, cleanedPath,language, null, textId);
+/*     */   }
+/*     */ 
+/*     */   public static Document Document(File f, String cleanedPath,String language, String dcMetaData, String textId)
+/*     */     throws IOException
+/*     */   {
+/*  70 */     Document doc = new Document();
+/*     */ 
+/*  74 */     doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*  74 */     doc.add(new Field("cleanedPath", cleanedPath, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*  75 */     if (dcMetaData == null)
+/*  76 */       dcMetaData = "";
+/*  77 */     doc.add(new Field("dcMetaData", dcMetaData, Field.Store.YES, Field.Index.ANALYZED));
+/*     */ 
+/*  79 */     if (textId == null)
+/*  80 */       textId = "";
+/*  81 */     doc.add(new Field("textId", textId, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */ 
+/*  87 */     doc.add(
+/*  89 */       new Field("modified", 
+/*  88 */       DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), 
+/*  89 */       Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */ 
+/*  95 */     Reader in = new XMLFilteredReader(new FileInputStream(f), "UTF-8");
+/*     */ 
+/*  98 */     doc.add(new Field("contents", in));
+/*     */ 
+/* 105 */     doc.add(new Field("language", language, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */ 
+/* 107 */     return doc;
+/*     */   }
+
+			
+/*     */ }
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.lucencetools.documents.FileDocument
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file