# HG changeset patch
# User dwinter
# Date 1340282275 -7200
# Node ID e93de4e99b5223f8e26eab85ac8ea1d4dbcdb6f4
# Parent  6c8dac2c52145ac29d79aee42990b910f7b992ed
indexMeta2rdf in dieses Projekt verschoben

diff -r 6c8dac2c5214 -r e93de4e99b52 .classpath
--- a/.classpath	Thu Jun 21 12:24:29 2012 +0200
+++ b/.classpath	Thu Jun 21 14:37:55 2012 +0200
@@ -9,6 +9,7 @@
+
diff -r 6c8dac2c5214 -r e93de4e99b52 lib/jdom-1.0.jar
Binary file lib/jdom-1.0.jar has changed
diff -r 6c8dac2c5214 -r e93de4e99b52 src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java	Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,159 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+
+/*
+ * Copyright 2000-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Stack;
+import java.util.Vector;
+
+import org.apache.log4j.Logger;
+import org.jdom.Document;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+/**
+ * Iterates depth-first over a directory tree and yields the path of every
+ * entry whose name ends with ".meta".
+ *
+ * Entries ending with "pageimg" are not descended into, and paths listed in
+ * the filter are never visited.
+ */
+public class IndexMetaIterator implements Iterator<String> {
+
+    private static final Logger LOG = Logger.getLogger(IndexMetaIterator.class);
+    private static final Logger NOT_ADDED = Logger.getLogger("notAddedFilesLogger");
+
+    private final File rootFolder;
+    private final Stack<String> stack = new Stack<String>();
+    private final List<String> filter = new ArrayList<String>(); // paths which shouldn't be indexed
+    private String pending; // next ".meta" path found by hasNext() but not yet returned
+
+    /**
+     * @param rootFolder directory at which the traversal starts
+     * @throws IOException if the root folder cannot be listed or its canonical
+     *             path cannot be resolved
+     */
+    public IndexMetaIterator(File rootFolder) throws IOException {
+        filter.add("/mpiwg/online/permanent/SudanRockArt"); // TODO: make this configurable
+        this.rootFolder = rootFolder;
+        // Dot-entries directly below the root are kept, as before; only deeper levels skip them.
+        if (!pushChildren(rootFolder, false)) {
+            throw new IOException("Cannot list folder: " + rootFolder);
+        }
+    }
+
+    /**
+     * Searches ahead for the next ".meta" entry, so the answer is exact even
+     * when only folders are left on the stack.
+     */
+    @Override
+    public boolean hasNext() {
+        if (pending == null) {
+            pending = findNextMeta();
+        }
+        return pending != null;
+    }
+
+    /**
+     * @return path of the next ".meta" entry, never null
+     * @throws NoSuchElementException if no ".meta" entry is left
+     */
+    @Override
+    public String next() {
+        if (!hasNext()) {
+            throw new NoSuchElementException();
+        }
+        String result = pending;
+        pending = null;
+        return result;
+    }
+
+    /** Removal is not supported; this class never deletes files. */
+    @Override
+    public void remove() {
+        throw new UnsupportedOperationException("remove");
+    }
+
+    /**
+     * Pops entries until one ending with ".meta" turns up, expanding every
+     * folder met on the way.
+     *
+     * @return the path found, or null once the stack is exhausted
+     */
+    private String findNextMeta() {
+        while (!stack.isEmpty()) {
+            String candidate = stack.pop();
+            LOG.debug("CHECK_________" + candidate);
+            if (candidate.endsWith(".meta")) {
+                LOG.debug("FOUND:" + candidate);
+                return candidate;
+            }
+            if (candidate.endsWith("pageimg")) {
+                continue; // skip pageimg
+            }
+            File entry = new File(candidate);
+            if (entry.isDirectory()) {
+                try {
+                    pushChildren(entry, true);
+                } catch (IOException e) {
+                    LOG.warn("Cannot resolve canonical path of " + candidate, e);
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Pushes the children of a folder onto the stack, leaving out filtered paths.
+     *
+     * @param folder folder to expand
+     * @param skipDotEntries whether children whose name starts with "." are left out
+     * @return false if the folder could not be listed
+     * @throws IOException if the canonical path of the folder cannot be resolved
+     */
+    private boolean pushChildren(File folder, boolean skipDotEntries) throws IOException {
+        String[] names = folder.list();
+        if (names == null) {
+            // File.list() yields null for non-directories and on I/O errors.
+            NOT_ADDED.info("Folder -" + folder + " cannot be listed!");
+            return false;
+        }
+        String base = folder.getCanonicalPath();
+        for (String name : names) {
+            String path = base + "/" + name;
+            if ((skipDotEntries && name.startsWith(".")) || filter.contains(path)) {
+                continue;
+            }
+            if (name.equals("")) {
+                // FIXME some filesystems (sshfs?) give empty filenames if the path contains special characters.
+                NOT_ADDED.info("Folder -" + path + " contains files with characters I cannot read!");
+                continue;
+            }
+            stack.push(path);
+        }
+        return true;
+    }
+}
diff -r 6c8dac2c5214 -r e93de4e99b52 src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java	Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,186 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.jdom.Attribute;
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+import org.jdom.xpath.XPath;
+import org.openrdf.model.Statement;
+import org.openrdf.model.impl.LiteralImpl;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.trig.TriGWriter;
+import org.openrdf.rio.turtle.TurtleWriter;
+import org.openrdf.model.Value;
+
+/**
+ * Converts ".meta" XML files into RDF statements handed to a Turtle writer.
+ */
+public class TransformIndexMeta {
+    private static final String RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
+
+    String OBJ_BASE_URL="http://echo.mpiwg-berlin.mpg.de/indexMeta/";
+    String ONT_BASE_URL="http://ontologies.mpiwg-berlin.mpg.de/general/MetaData/";
+    private String indexMetaType ="http://ontologies.mpiwg-berlin.mpg.de/general/IndexMeta";
+    private String bibObjType ="http://ontologies.mpiwg-berlin.mpg.de/general/BibData";
+    private FileWriter out;
+    private FileWriter error;
+    private TurtleWriter turtleWriter;
+
+    /**
+     * @param fw receives the Turtle output
+     * @param ew receives one line per file that could not be transformed completely
+     */
+    public TransformIndexMeta(FileWriter fw, FileWriter ew){
+        out=fw;
+        error=ew;
+        turtleWriter = new TurtleWriter(fw);
+    }
+
+    /**
+     * Reads one meta data file and writes its content as RDF statements.
+     *
+     * Files which cannot be parsed, or which lack the escidoc-test dri, the bib
+     * element or the bib type, are reported to the error writer; statements
+     * written before the missing part was noticed stay in the output.
+     *
+     * @param metaData path of the XML file to transform
+     * @throws IOException if the file cannot be opened or the error writer fails
+     * @throws JDOMException if an XPath expression cannot be compiled or evaluated
+     * @throws RDFHandlerException if the Turtle writer reports a failure
+     */
+    public void transform(String metaData) throws IOException, JDOMException, RDFHandlerException{
+        turtleWriter.startRDF();
+        try {
+            Document doc = parse(metaData);
+            if (doc != null) {
+                writeDocument(doc, metaData);
+            }
+        } finally {
+            // Pair every startRDF() with endRDF(), also on early exits, since the writer is reused for the next file.
+            turtleWriter.endRDF();
+        }
+    }
+
+    /**
+     * Parses the file and closes the input stream in all cases.
+     *
+     * @return the parsed document, or null if parsing failed (already reported)
+     */
+    private Document parse(String metaData) throws IOException {
+        FileInputStream is = new FileInputStream(metaData);
+        try {
+            return new SAXBuilder().build(is);
+        } catch (Exception e1) {
+            System.err.println("Cannot parse:"+metaData);
+            error.write("cannotparse:"+metaData+"\n");
+            return null;
+        } finally {
+            is.close();
+        }
+    }
+
+    /** Writes the statements for the described object and for its bib element. */
+    private void writeDocument(Document doc, String metaData) throws IOException, JDOMException, RDFHandlerException {
+        XPath xpDri = XPath.newInstance("//meta/dri[@type=\"escidoc-test\"]");
+        Element result = (Element)xpDri.selectSingleNode(doc);
+        if (result==null){
+            System.err.println("No dri for:"+metaData);
+            error.write("Non dri for:"+metaData+"\n");
+            return;
+        }
+        String objIdent=OBJ_BASE_URL+result.getTextTrim();
+        emit(objIdent, RDF_TYPE, new URIImpl(indexMetaType));
+
+        for (Object n: doc.getRootElement().getChildren()){
+            if (n instanceof Element) {
+                Element e = (Element)n;
+                String txt=e.getTextTrim();
+                if (!txt.equals("")){ // children without text of their own are left out
+                    emit(objIdent, ONT_BASE_URL+e.getName(), new LiteralImpl(txt));
+                }
+            }
+        }
+
+        Element bibElement = (Element)XPath.newInstance("//meta/bib").selectSingleNode(doc);
+        if (bibElement==null){
+            System.err.println("No bibelement in:"+metaData);
+            error.write("No bibelement in:"+metaData+"\n");
+            return;
+        }
+        String bibIdent=objIdent+":bib";
+        emit(bibIdent, RDF_TYPE, new URIImpl(bibObjType));
+        emit(objIdent, ONT_BASE_URL+"has_bibl_metaData", new URIImpl(bibIdent));
+
+        Attribute bibType = bibElement.getAttribute("type");
+        if (bibType==null){
+            System.err.println("No bibtype in:"+metaData);
+            error.write("No bibtype in:"+metaData+"\n");
+            return;
+        }
+        emit(bibIdent, ONT_BASE_URL+"is_of_type", new LiteralImpl(bibType.getValue()));
+
+        // Unlike the top-level children, bib children are emitted even when their text is empty.
+        for (Object n: bibElement.getChildren()){
+            if (n instanceof Element) {
+                Element e = (Element)n;
+                emit(bibIdent, ONT_BASE_URL+e.getName(), new LiteralImpl(e.getTextTrim()));
+            }
+        }
+    }
+
+    /** Hands a single subject/predicate/object statement to the Turtle writer. */
+    private void emit(String subject, String predicate, Value object) throws RDFHandlerException {
+        turtleWriter.handleStatement(new StatementImpl(new URIImpl(subject), new URIImpl(predicate), object));
+    }
+
+    /**
+     * Transforms every ".meta" file found below the given root path into
+     * /tmp/out.rdf; problems are listed in /tmp/errors_transform.txt.
+     *
+     * @param args exactly one element: the root path
+     */
+    static public void main(String[] args) throws IOException, RDFHandlerException{
+        if (args.length!=1){
+            System.out.println("Usage: transfom path");
+            System.exit(1);
+        }
+        String root = args[0];
+        FileWriter fw = new FileWriter("/tmp/out.rdf");
+        try {
+            FileWriter ew = new FileWriter("/tmp/errors_transform.txt");
+            try {
+                TransformIndexMeta tim = new TransformIndexMeta(fw,ew);
+                Iterator<String> it = new IndexMetaIterator(new File(root));
+                while (it.hasNext()){
+                    String nx = it.next();
+                    if (nx==null){
+                        continue; // defensive: skip a null entry instead of failing on it
+                    }
+                    try {
+                        tim.transform(nx);
+                    } catch (JDOMException e) {
+                        System.out.println("JDOM exception:"+nx);
+                    }
+                    fw.flush();
+                    ew.flush();
+                }
+            } finally {
+                ew.close();
+            }
+        } finally {
+            fw.close();
+        }
+    }
+}