changeset 4:e93de4e99b52 default tip

indexMeta2rdf in dieses Projekt verschoben
author dwinter
date Thu, 21 Jun 2012 14:37:55 +0200
parents 6c8dac2c5214
children
files .classpath lib/jdom-1.0.jar src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java
diffstat 4 files changed, 328 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/.classpath	Thu Jun 21 12:24:29 2012 +0200
+++ b/.classpath	Thu Jun 21 14:37:55 2012 +0200
@@ -9,6 +9,7 @@
 	<classpathentry kind="lib" path="lib/protege-owl.jar"/>
 	<classpathentry kind="lib" path="lib/protege.jar"/>
 	<classpathentry kind="lib" path="lib/openrdf-sesame-2.6.6-onejar.jar"/>
+	<classpathentry kind="lib" path="lib/jdom-1.0.jar"/>
 	<classpathentry kind="con" path="org.eclipse.jst.j2ee.internal.module.container"/>
 	<classpathentry kind="con" path="org.eclipse.jst.server.core.container/org.eclipse.jst.server.tomcat.runtimeTarget/Apache Tomcat v6.0">
 		<attributes>
Binary file lib/jdom-1.0.jar has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java	Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,145 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+
+/*
+ * Copyright  2000-2004 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Stack;
+import java.util.Vector;
+
+
+
+import org.apache.log4j.Logger;
+import org.jdom.Document;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+
+/**
+ * Iterates over the paths of all entries ending in <code>.meta</code> below a
+ * root folder. Folders are walked depth-first with an explicit stack and each
+ * path is built from the canonical path of its parent folder. Entries ending
+ * in <code>pageimg</code> are not descended into, entries whose name starts
+ * with "." are skipped below the first level, and filtered paths are ignored.
+ */
+public class IndexMetaIterator implements Iterator<String> {
+
+	private final Stack<String> stack = new Stack<String>(); // paths still to be visited
+	private final List<String> filter = new ArrayList<String>(); // paths which shouldn't be indexed
+	private String lookahead; // .meta path found by hasNext() but not yet handed out
+
+	/**
+	 * Starts a walk over the content of <code>rootFolder</code>.
+	 * @param rootFolder folder whose content is searched
+	 * @throws IOException if the folder cannot be listed or its canonical path cannot be resolved
+	 */
+	public IndexMetaIterator(File rootFolder) throws IOException{
+		filter.add("/mpiwg/online/permanent/SudanRockArt"); // TODO: make this configurable
+		String[] names = rootFolder.list();
+		if (names == null) { // File.list() reports failure by returning null
+			throw new IOException("Cannot list folder: " + rootFolder);
+		}
+		String rootPath = rootFolder.getCanonicalPath();
+		for (String f : names) {
+			String fn = rootPath + "/" + f;
+			if (!filter.contains(fn)) {
+				if (!f.equals("")) { // FIXME some filesystems (sshfs?) give empty filenames if the path contains special characters.
+					stack.push(fn);
+				} else {
+					Logger.getLogger("notAddedFilesLogger").info("Folder -" +fn+" contains files with charakters I cannot read!" );
+				}
+			}
+		}
+	}
+
+	/**
+	 * Pushes the children of <code>dir</code> that are neither hidden nor filtered.
+	 * Does nothing for plain files; folders that cannot be read are logged and skipped.
+	 */
+	private void expand(File dir) {
+		if (!dir.isDirectory()) {
+			return;
+		}
+		String[] names = dir.list();
+		if (names == null) { // listing failed; the walk goes on with the remaining entries
+			Logger.getLogger("notAddedFilesLogger").info("Folder -" + dir + " cannot be listed!");
+			return;
+		}
+		String dirPath;
+		try {
+			dirPath = dir.getCanonicalPath();
+		} catch (IOException e) {
+			Logger.getLogger("notAddedFilesLogger").error("Cannot resolve folder " + dir, e);
+			return;
+		}
+		for (String f : names) {
+			String fn = dirPath + "/" + f;
+			if (!f.startsWith(".") && !filter.contains(fn)) {
+				if (!f.equals("")) { // FIXME empty names can show up here as well, see the constructor
+					stack.push(fn);
+				} else {
+					Logger.getLogger("notAddedFilesLogger").info("Folder -" +fn+" contains files with characters I cannot read!" );
+				}
+			}
+		}
+	}
+
+	/** Searches ahead (expanding folders on the way), so true guarantees that next() has a path to return. */
+	@Override
+	public boolean hasNext() {
+		while (lookahead == null && !stack.isEmpty()) {
+			String candidate = stack.pop();
+			if (candidate.endsWith(".meta")) {
+				lookahead = candidate;
+			} else {
+				System.out.println("CHECK_________"+candidate);
+				if (!candidate.endsWith("pageimg")) { // skip pageimg
+					expand(new File(candidate));
+				}
+			}
+		}
+		return lookahead != null;
+	}
+
+	/** Returns the next .meta path (never null); throws NoSuchElementException once the walk is finished. */
+	@Override
+	public String next() {
+		if (!hasNext()) {
+			throw new java.util.NoSuchElementException();
+		}
+		String found = lookahead;
+		lookahead = null;
+		System.out.println("FOUND:"+found);
+		return found;
+	}
+
+	/** Removing entries is not supported. */
+	@Override
+	public void remove() {
+		throw new UnsupportedOperationException("remove");
+	}
+}
+
+
+            
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java	Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,182 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.jdom.Attribute;
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+import org.jdom.xpath.XPath;
+import org.openrdf.model.Statement;
+import org.openrdf.model.impl.LiteralImpl;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.trig.TriGWriter;
+import org.openrdf.rio.turtle.TurtleWriter;
+import org.openrdf.model.Value;
+
+
+
+
+	
+/**
+ * Converts index.meta XML files into RDF statements that are written in Turtle
+ * syntax to the output writer. Files that cannot be converted completely are
+ * reported on stderr and in the error writer.
+ */
+public class TransformIndexMeta {
+	private static final String RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
+
+	String OBJ_BASE_URL="http://echo.mpiwg-berlin.mpg.de/indexMeta/";
+	String ONT_BASE_URL="http://ontologies.mpiwg-berlin.mpg.de/general/MetaData/";
+	private String indexMetaType ="http://ontologies.mpiwg-berlin.mpg.de/general/IndexMeta";
+	private String bibObjType ="http://ontologies.mpiwg-berlin.mpg.de/general/BibData";
+	private FileWriter error;
+	private TurtleWriter turtleWriter;
+
+	/**
+	 * @param fw receives the generated Turtle
+	 * @param ew receives one line for every file that could not be converted completely
+	 */
+	public  TransformIndexMeta(FileWriter fw, FileWriter ew){
+		error=ew;
+		turtleWriter = new TurtleWriter(fw);
+	}
+
+	/**
+	 * Writes the statements for one index.meta file to the Turtle writer,
+	 * enclosed in its own startRDF()/endRDF() pair.
+	 * <p>
+	 * Emitted are: the type of the object named by the
+	 * <code>meta/dri[@type="escidoc-test"]</code> element, one literal per
+	 * non-empty child element of the root and, if a <code>meta/bib</code>
+	 * element with a type attribute exists, a linked bibliographic object with
+	 * one literal per child element. A missing part is reported and ends the
+	 * conversion of this file; statements written up to then are kept.
+	 * @param metaData path of the index.meta file
+	 * @throws IOException if the file cannot be read or the error writer fails
+	 * @throws JDOMException if an XPath expression cannot be compiled or evaluated
+	 * @throws RDFHandlerException if the Turtle writer fails
+	 */
+	public void transform(String metaData) throws IOException, JDOMException, RDFHandlerException{
+		turtleWriter.startRDF();
+		try {
+			Document doc;
+			FileInputStream is = new FileInputStream(metaData);
+			try {
+				doc = new SAXBuilder().build(is);
+			} catch (Exception e1) {
+				report("Cannot parse:", "cannotparse:", metaData);
+				return;
+			} finally {
+				is.close(); // release the file handle, also when parsing failed
+			}
+			writeStatements(doc, metaData);
+		} finally {
+			// Every exit has to close the document again: a writer that is left
+			// open is expected to reject the startRDF() of the next file.
+			turtleWriter.endRDF();
+		}
+	}
+
+	/** Prints <code>consolePrefix + metaData</code> on stderr and appends <code>filePrefix + metaData</code> as a line to the error writer. */
+	private void report(String consolePrefix, String filePrefix, String metaData) throws IOException {
+		System.err.println(consolePrefix+metaData);
+		error.write(filePrefix+metaData+"\n");
+	}
+
+	/** Emits all statements for an already parsed document; <code>metaData</code> is only used for reporting. */
+	private void writeStatements(Document doc, String metaData) throws IOException, JDOMException, RDFHandlerException {
+		XPath xpDri = XPath.newInstance("//meta/dri[@type=\"escidoc-test\"]");
+		Element dri = (Element)xpDri.selectSingleNode(doc);
+		if (dri==null){
+			report("No dri for:", "Non dri for:", metaData);
+			return;
+		}
+		String objIdent=OBJ_BASE_URL+dri.getTextTrim();
+		URIImpl object = new URIImpl(objIdent);
+		URIImpl rdfType = new URIImpl(RDF_TYPE);
+		turtleWriter.handleStatement(new StatementImpl(object, rdfType, new URIImpl(indexMetaType)));
+		writeChildLiterals(object, doc.getRootElement(), true);
+
+		XPath bib = XPath.newInstance("//meta/bib");
+		Element bibElement = (Element)bib.selectSingleNode(doc);
+		if (bibElement==null){
+			report("No bibelement in:", "No bibelement in:", metaData);
+			return;
+		}
+		URIImpl bibObject = new URIImpl(objIdent+":bib");
+		turtleWriter.handleStatement(new StatementImpl(bibObject, rdfType, new URIImpl(bibObjType)));
+		turtleWriter.handleStatement(new StatementImpl(object, new URIImpl(ONT_BASE_URL+"has_bibl_metaData"), bibObject));
+
+		Attribute bibType = bibElement.getAttribute("type");
+		if (bibType==null){
+			report("No bibtype in:", "No bibtype in:", metaData);
+			return;
+		}
+		turtleWriter.handleStatement(new StatementImpl(bibObject, new URIImpl(ONT_BASE_URL+"is_of_type"), new LiteralImpl(bibType.getValue())));
+		writeChildLiterals(bibObject, bibElement, false);
+	}
+
+	/**
+	 * Emits one literal statement per child element of <code>parent</code>: the
+	 * predicate is ONT_BASE_URL plus the element name, the object its trimmed text.
+	 * @param skipEmpty whether children without text are left out
+	 */
+	private void writeChildLiterals(URIImpl subject, Element parent, boolean skipEmpty) throws RDFHandlerException {
+		for (Object n: parent.getChildren()){
+			if (n instanceof Element){
+				Element e = (Element)n;
+				String txt=e.getTextTrim();
+				if (!skipEmpty || !txt.equals("")){
+					turtleWriter.handleStatement(new StatementImpl(subject, new URIImpl(ONT_BASE_URL+e.getName()), new LiteralImpl(txt)));
+				}
+			}
+		}
+	}
+
+	/**
+	 * Converts all index.meta files below the folder given as the only argument.
+	 * Turtle goes to /tmp/out.rdf, problem reports go to /tmp/errors_transform.txt.
+	 */
+	static public void main(String[] args) throws IOException, RDFHandlerException{
+		if (args.length!=1){
+			System.out.println("Usage: transfom path");
+			System.exit(1);
+		}
+		String root = args[0];
+		FileWriter fw = new FileWriter("/tmp/out.rdf");
+		try {
+			FileWriter ew = new FileWriter("/tmp/errors_transform.txt");
+			try {
+				TransformIndexMeta tim = new TransformIndexMeta(fw,ew);
+				Iterator<String> it = new IndexMetaIterator(new File(root));
+				while (it.hasNext()){
+					String nx = it.next();
+					if (nx!=null){ // guard: a null path handed out by the iterator is skipped
+						try {
+							tim.transform(nx);
+						} catch (JDOMException e) {
+							System.out.println("JDOM exception:"+nx);
+						}
+					}
+					fw.flush();
+					ew.flush();
+				}
+			} finally {
+				ew.close(); // close both writers even when the run is aborted by an exception
+			}
+		} finally {
+			fw.close();
+		}
+	}
+}