Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/lucencetools/documents/FileDocument.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,73 @@ +/* */ package de.mpiwg.dwinter.lucencetools.documents; +/* */ +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.XMLFilteredReader; +/* */ import java.io.File; +/* */ import java.io.FileInputStream; +/* */ import java.io.IOException; +/* */ import java.io.Reader; +/* */ import org.apache.lucene.document.DateTools; +/* */ import org.apache.lucene.document.DateTools.Resolution; +/* */ import org.apache.lucene.document.Document; +/* */ import org.apache.lucene.document.Field; +/* */ import org.apache.lucene.document.Field.Index; +/* */ import org.apache.lucene.document.Field.Store; +/* */ +/* */ public class FileDocument +/* */ { + + public static String toXML(Document doc){ + //String path = doc.get("path"); + String cleanedPath = doc.get("cleanedPath"); + String textId = doc.get("textId"); + String md = doc.get("dcMetaData"); + String ret = "<result>"; + ret+= "<cleanedPath>"+cleanedPath+"</cleanedPath>"; + ret+= "<textId>"+textId.replace("/",":")+"</textId>"; + ret+= "<textIdCleaned>"+textId.replace("/","_")+"</textIdCleaned>"; + ret+= "<md>"+md+"</md>"; + ret+="</result>"; + return ret; + + } +/* */ public static Document Document(File f, String cleanedPath,String language, String textId) +/* */ throws IOException +/* */ { +/* 63 */ return Document(f, cleanedPath,language, null, textId); +/* */ } +/* */ +/* */ public static Document Document(File f, String cleanedPath,String language, String dcMetaData, String textId) +/* */ throws IOException +/* */ { +/* 70 */ Document doc = new Document(); +/* */ +/* 74 */ doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* 74 */ doc.add(new Field("cleanedPath", cleanedPath, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* 75 */ if (dcMetaData == null) +/* 76 */ dcMetaData = ""; +/* 77 */ doc.add(new Field("dcMetaData", dcMetaData, Field.Store.YES, Field.Index.ANALYZED)); +/* */ +/* 79 */ if (textId == null) +/* 80 */ textId = ""; +/* 81 */ doc.add(new Field("textId", textId, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 87 */ doc.add( +/* 89 */ new Field("modified", +/* 88 */ DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), +/* 89 */ Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 95 */ Reader in = new XMLFilteredReader(new FileInputStream(f), "UTF-8"); +/* */ +/* 98 */ doc.add(new Field("contents", in)); +/* */ +/* 105 */ doc.add(new Field("language", language, Field.Store.YES, Field.Index.NOT_ANALYZED)); +/* */ +/* 107 */ return doc; +/* */ } + + +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.lucencetools.documents.FileDocument + * JD-Core Version: 0.5.4 + */ \ No newline at end of file