# HG changeset patch
# User dwinter
# Date 1340282275 -7200
# Node ID e93de4e99b5223f8e26eab85ac8ea1d4dbcdb6f4
# Parent 6c8dac2c52145ac29d79aee42990b910f7b992ed
indexMeta2rdf in dieses Projekt verschoben
diff -r 6c8dac2c5214 -r e93de4e99b52 .classpath
--- a/.classpath Thu Jun 21 12:24:29 2012 +0200
+++ b/.classpath Thu Jun 21 14:37:55 2012 +0200
@@ -9,6 +9,7 @@
+
diff -r 6c8dac2c5214 -r e93de4e99b52 lib/jdom-1.0.jar
Binary file lib/jdom-1.0.jar has changed
diff -r 6c8dac2c5214 -r e93de4e99b52 src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/IndexMetaIterator.java Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,145 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+
+/*
+ * Copyright 2000-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Stack;
+import java.util.Vector;
+
+
+
+import org.apache.log4j.Logger;
+import org.jdom.Document;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+
+/**
+ * An iterator that walks a directory tree depth-first, starting at the root
+ * folder given to the constructor, and returns the absolute path of each
+ * "*.meta" file it encounters. Directories named "pageimg" are skipped.
+ * Paths listed in the filter are excluded from the walk.
+ */
+public class IndexMetaIterator implements Iterator<String> {
+
+ private File rootFolder; // root of the traversal
+ private File currentFolder; // NOTE(review): assigned in the constructor but never read — candidate for removal
+ private Stack<String> stack; // pending absolute paths still to be visited (depth-first)
+ private ArrayList<String> filter; // absolute paths which should NOT be indexed
+
+ public IndexMetaIterator(File rootFolder) throws IOException{
+
+ filter = new ArrayList<String>();
+ filter.add("/mpiwg/online/permanent/SudanRockArt"); // TODO: make this configurable
+
+ this.rootFolder=rootFolder;
+ this.currentFolder=rootFolder;
+ this.stack = new Stack<String>();
+
+ for (String f:rootFolder.list()){
+ String fn = rootFolder.getCanonicalPath()+"/"+f;
+ if (!filter.contains(fn)){
+ if (!f.equals("")){ // FIXME some filesystems (sshfs?) gives empty filenames if the path contains special characters.
+ stack.push(fn);}
+ else {
+ Logger.getLogger("notAddedFilesLogger").info("Folder -" +fn+" contains files with characters I cannot read!" );
+ }
+ }
+ }
+ }
+ @Override
+ public boolean hasNext() {
+ // the stack is seeded in the constructor; an empty stack means the walk is finished
+ return !stack.isEmpty();
+ }
+
+ @Override
+ public String next() {
+ // pop entries until a ".meta" file surfaces, pushing directory contents as we go
+ String nextFile = stack.pop();
+ while(!nextFile.endsWith(".meta") && !stack.isEmpty()){
+ System.out.println("CHECK_________"+nextFile);
+
+
+ if(!nextFile.endsWith("pageimg")){ //skip pageimg
+
+
+ File nf = new File(nextFile);
+
+
+ if(nf.isDirectory()){
+ String[] ls = nf.list();
+ if (ls==null){
+ return null; // NOTE(review): abandons remaining stack entries; callers must handle null (see main in TransformIndexMeta)
+ }
+ for (String f:ls){
+ String fn;
+ try {
+ if (!f.startsWith(".")){ // skip hidden files
+ fn = nf.getCanonicalPath()+"/"+f;
+ if (!filter.contains(fn)){
+ if (!f.equals("")) {// FIXME some filesystems (sshfs?) gives empty filenames if the path contains special characters.
+ stack.push(fn);}
+ else {
+ Logger.getLogger("notAddedFilesLogger").info("Folder -" +fn+" contains files with characters I cannot read!" );
+ }
+
+ }
+ }
+ } catch (IOException e) {
+ // an unresolvable path is reported but does not abort the walk
+ e.printStackTrace();
+ }
+
+ }
+ }
+ }
+
+ nextFile = stack.pop();
+
+ }
+ if (!nextFile.endsWith(".meta")) //the last popped entry still has to be checked separately.
+ nextFile = null;
+ System.out.println("FOUND:"+nextFile);
+
+ if (nextFile!=null)
+ return nextFile;
+
+ return null; // NOTE(review): Iterator.next() should rather throw NoSuchElementException; kept for the null-checking caller
+ }
+
+
+ @Override
+ public void remove() {
+ // removal is not supported; intentionally a no-op
+
+ }
+
+
+
+}
+
+
+
diff -r 6c8dac2c5214 -r e93de4e99b52 src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/indexMeta2RDF/TransformIndexMeta.java Thu Jun 21 14:37:55 2012 +0200
@@ -0,0 +1,182 @@
+package de.mpiwg.itgroup.indexMeta2RDF;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.jdom.Attribute;
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+import org.jdom.xpath.XPath;
+import org.openrdf.model.Statement;
+import org.openrdf.model.impl.LiteralImpl;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.trig.TriGWriter;
+import org.openrdf.rio.turtle.TurtleWriter;
+import org.openrdf.model.Value;
+
+
+
+
+
+public class TransformIndexMeta {
+ String OBJ_BASE_URL="http://echo.mpiwg-berlin.mpg.de/indexMeta/";
+ String ONT_BASE_URL="http://ontologies.mpiwg-berlin.mpg.de/general/MetaData/";
+ private String indexMetaType ="http://ontologies.mpiwg-berlin.mpg.de/general/IndexMeta";
+ private String bibObjType ="http://ontologies.mpiwg-berlin.mpg.de/general/BibData";
+ private FileWriter out;
+ private FileWriter error;
+ private TurtleWriter turtleWriter;
+
+
+ public TransformIndexMeta(FileWriter fw, FileWriter ew){
+ out=fw;
+ error=ew;
+ turtleWriter = new TurtleWriter(fw);
+ }
+ public void transform(String metaData) throws IOException, JDOMException, RDFHandlerException{
+ //URL url = new URL(metaData);
+ //InputStream is = url.openStream();
+
+ turtleWriter.startRDF();
+ FileInputStream is = new FileInputStream(metaData);
+ Document doc;
+ try {
+ doc = new SAXBuilder().build(is);
+ } catch (Exception e1) {
+ // TODO Auto-generated catch block
+ System.err.println("Cannot parse:"+metaData);
+ error.write("cannotparse:"+metaData+"\n");
+ return;
+ }
+
+ XPath xpDri = XPath.newInstance("//meta/dri[@type=\"escidoc-test\"]");
+
+ Element result = (Element)xpDri.selectSingleNode(doc);
+
+ if (result==null){
+ System.err.println("No dri for:"+metaData);
+ error.write("Non dri for:"+metaData+"\n");
+ return;
+ }
+
+ String dri=result.getTextTrim();
+
+ String objIdent=OBJ_BASE_URL+dri;
+
+
+ //out.write(String.format("<%s> rdf:type <%s>.\n", objIdent,indexMetaType));
+
+ Statement smt = new StatementImpl(new URIImpl(objIdent), new URIImpl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), new URIImpl(indexMetaType));
+ turtleWriter.handleStatement(smt);
+
+ Element resElement = doc.getRootElement();
+
+ for (Object n: resElement.getChildren()){
+ if (Element.class.isInstance(n)) {
+ Element e = (Element)n;
+
+ if (!e.getTextTrim().equals("")){
+ String txt=e.getTextTrim();//.replace("\"","\\\"");
+
+ smt = new StatementImpl
+ (new URIImpl(objIdent), new URIImpl(ONT_BASE_URL+e.getName()), (Value)(new LiteralImpl(txt)));
+ turtleWriter.handleStatement(smt);
+
+
+ //out.write(String.format("<%s> <%s> \"%s\".\n", objIdent,ONT_BASE_URL+e.getName(),txt));
+ }
+ }
+
+
+ }
+
+ XPath bib = XPath.newInstance("//meta/bib");
+
+ Element bibElement = (Element)bib.selectSingleNode(doc);
+ if (bibElement==null){
+ System.err.println("No bibelement in:"+metaData);
+ error.write("No bibelement in:"+metaData+"\n");
+ return;
+ }
+ String bibIdent=objIdent+":bib";
+
+ smt = new StatementImpl(new URIImpl(bibIdent), new URIImpl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), new URIImpl(bibObjType));
+ turtleWriter.handleStatement(smt);
+
+ //out.write(String.format("<%s> rdf:type <%s>.\n",bibIdent,bibObjType));
+
+ smt = new StatementImpl(new URIImpl(objIdent), new URIImpl(ONT_BASE_URL+"has_bibl_metaData"), new URIImpl(bibIdent));
+ turtleWriter.handleStatement(smt);
+
+ //out.write(String.format("<%s> <%s> <%s>.\n", objIdent,ONT_BASE_URL+"has_bibl_metaData",bibIdent));
+
+ Attribute bibType = bibElement.getAttribute("type");
+ if (bibType==null){
+ System.err.println("No bibtype in:"+metaData);
+ error.write("No bibtype in:"+metaData+"\n");
+ return;
+ }
+ String type=bibType.getValue();
+
+ smt = new StatementImpl
+ (new URIImpl(bibIdent), new URIImpl(ONT_BASE_URL+"is_of_type"), (Value)(new LiteralImpl(type)));
+ turtleWriter.handleStatement(smt);
+
+ //out.write(String.format("<%s> <%s> \"%s\".\n", bibIdent,ONT_BASE_URL+"is_of_type",type));
+
+ for (Object n: bibElement.getChildren()){
+ if (Element.class.isInstance(n)) {
+ Element e = (Element)n;
+ String txt=e.getTextTrim();//.replace("\"","\\\"");
+ smt = new StatementImpl
+ (new URIImpl(bibIdent), new URIImpl(ONT_BASE_URL+e.getName()), (Value)(new LiteralImpl(txt)));
+ turtleWriter.handleStatement(smt);
+
+ //out.write(String.format("<%s> <%s> \"%s\".\n", bibIdent,ONT_BASE_URL+e.getName(),txt));
+ }
+
+
+ }
+ turtleWriter.endRDF();
+
+}
+
+ static public void main(String[] args) throws IOException, RDFHandlerException{
+ if (args.length!=1){
+ System.out.println("Usage: transfom path");
+ System.exit(1);
+ }
+ String root = args[0];
+ FileWriter fw = new FileWriter("/tmp/out.rdf");
+ FileWriter ew = new FileWriter("/tmp/errors_transform.txt");
+ TransformIndexMeta tim = new TransformIndexMeta(fw,ew);
+
+ Iterator it = new IndexMetaIterator(new File(root));
+ while (it.hasNext()){
+ String nx = it.next();
+ try {
+ if(nx==null){
+ continue; //weiss noch nicht warum das passiert.
+ }
+ tim.transform(nx);
+ } catch (JDOMException e) {
+ System.out.println("JDOM exception:"+nx);
+ //e.printStackTrace();
+ }
+ fw.flush();
+ ew.flush();
+ }
+ fw.close();
+ ew.close();
+ }
+}