fulltextIndexer: src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java comparison

comparison src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java @ 0:dc7622afcfea default tip

initial

author	dwinter
date	Wed, 03 Nov 2010 12:33:16 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:dc7622afcfea
+/*     */ package de.mpiwg.dwinter.lucencetools.documents;
+/*     */
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.XMLFilteredReader;
+/*     */ import java.io.File;
+/*     */ import java.io.FileInputStream;
+/*     */ import java.io.IOException;
+/*     */ import java.io.Reader;
+/*     */ import org.apache.lucene.document.DateTools;
+/*     */ import org.apache.lucene.document.DateTools.Resolution;
+/*     */ import org.apache.lucene.document.Document;
+/*     */ import org.apache.lucene.document.Field;
+/*     */ import org.apache.lucene.document.Field.Index;
+/*     */ import org.apache.lucene.document.Field.Store;
+/*     */
+/*     */ public class FileDocument
+/*     */ {
+			public static String toXML(Document doc){
+				//String path = doc.get("path");
+				String cleanedPath = doc.get("cleanedPath");
+				String textId = doc.get("textId");
+				String md = doc.get("dcMetaData");
+				String ret = "<result>";
+				ret+= "<cleanedPath>"+cleanedPath+"</cleanedPath>";
+				ret+= "<textId>"+textId.replace("/",":")+"</textId>";
+				ret+= "<textIdCleaned>"+textId.replace("/","_")+"</textIdCleaned>";
+				ret+= "<md>"+md+"</md>";
+				ret+="</result>";
+				return ret;
+			}
+/*     */   public static Document Document(File f, String cleanedPath,String language, String textId)
+/*     */     throws IOException
+/*     */   {
+/*  63 */     return Document(f, cleanedPath,language, null, textId);
+/*     */   }
+/*     */
+/*     */   public static Document Document(File f, String cleanedPath,String language, String dcMetaData, String textId)
+/*     */     throws IOException
+/*     */   {
+/*  70 */     Document doc = new Document();
+/*     */
+/*  74 */     doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*  74 */     doc.add(new Field("cleanedPath", cleanedPath, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*  75 */     if (dcMetaData == null)
+/*  76 */       dcMetaData = "";
+/*  77 */     doc.add(new Field("dcMetaData", dcMetaData, Field.Store.YES, Field.Index.ANALYZED));
+/*     */
+/*  79 */     if (textId == null)
+/*  80 */       textId = "";
+/*  81 */     doc.add(new Field("textId", textId, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */
+/*  87 */     doc.add(
+/*  89 */       new Field("modified",
+/*  88 */       DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
+/*  89 */       Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */
+/*  95 */     Reader in = new XMLFilteredReader(new FileInputStream(f), "UTF-8");
+/*     */
+/*  98 */     doc.add(new Field("contents", in));
+/*     */
+/* 105 */     doc.add(new Field("language", language, Field.Store.YES, Field.Index.NOT_ANALYZED));
+/*     */
+/* 107 */     return doc;
+/*     */   }
+/*     */ }
+/* Location:           /private/tmp/fulltextIndexer.jar
+* Qualified Name:     de.mpiwg.dwinter.lucencetools.documents.FileDocument
+* JD-Core Version:    0.5.4
+*/

Mercurial > hg > fulltextIndexer

comparison src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java @ 0:dc7622afcfea default tip