String

# HG changeset patch # User dwinter # Date 1319398185 -7200 # Node ID 2267d8c80a99fff32947dfb09cbd17a6f2bcede1 # Parent 90a19cbda471e7c497f4184b6a4227a21ea31ac3 intial diff -r 90a19cbda471 -r 2267d8c80a99 .classpath --- a/.classpath Wed Nov 24 16:54:52 2010 +0100 +++ b/.classpath Sun Oct 23 21:29:45 2011 +0200 @@ -8,6 +8,14 @@ + + + + + + + + diff -r 90a19cbda471 -r 2267d8c80a99 .externalToolBuilders/org.eclipse.wst.validation.validationbuilder.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.externalToolBuilders/org.eclipse.wst.validation.validationbuilder.launch Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,7 @@ + + + + + + + diff -r 90a19cbda471 -r 2267d8c80a99 .project --- a/.project Wed Nov 24 16:54:52 2010 +0100 +++ b/.project Sun Oct 23 21:29:45 2011 +0200 @@ -16,8 +16,13 @@ - org.eclipse.wst.validation.validationbuilder + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + LaunchConfigHandle + <project>/.externalToolBuilders/org.eclipse.wst.validation.validationbuilder.launch + diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.apache.httpclient.jar Binary file libs/org.apache.httpclient.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.apache.httpcore.jar Binary file libs/org.apache.httpcore.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.apache.httpmime.jar Binary file libs/org.apache.httpmime.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.json.jar Binary file libs/org.json.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.restlet.ext.json.jar Binary file libs/org.restlet.ext.json.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.restlet.ext.servlet.jar Binary file libs/org.restlet.ext.servlet.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/org.restlet.jar Binary file libs/org.restlet.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 libs/xercesImpl-2.9.1.jar Binary file libs/xercesImpl-2.9.1.jar has changed diff -r 90a19cbda471 -r 2267d8c80a99 
src/de/mpiwg/itgroup/metadataManager/client/MetadataClient.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/client/MetadataClient.java Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,123 @@ +package de.mpiwg.itgroup.metadataManager.client; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; + +import javax.xml.parsers.SAXParser; + +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.utils.URIUtils; +import org.apache.http.impl.client.DefaultHttpClient; +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.JDOMException; +import org.jdom.input.SAXBuilder; +import org.jdom.output.XMLOutputter; +import org.jdom.xpath.XPath; +import org.json.JSONObject; +import org.json.JSONTokener; + +import org.restlet.data.MediaType; +import org.restlet.representation.Representation; +import org.restlet.resource.ClientResource; + +public class MetadataClient { + + private String serverUrl; + + public MetadataClient(String string) { + serverUrl = string; + + } + + public String replaceBIB(String idOfObject, String newBibTag, boolean save) throws URISyntaxException, ClientProtocolException, IOException { + + String newIM=null; + SAXBuilder sb = new SAXBuilder(); + + String url = serverUrl+"indexMeta/permanent/library/"+idOfObject; + + HttpClient httpclient = new DefaultHttpClient(); + + HttpGet httpget = new HttpGet(new URI(url)); + + HttpResponse response = httpclient.execute(httpget); + HttpEntity entity = response.getEntity(); + + try { + if (entity != null) { + InputStream instream = (InputStream) entity.getContent(); + Document dom = sb.build(instream); 
+ + XPath xp =XPath.newInstance("//bib"); + Element bibNode = (Element)xp.selectSingleNode(dom); + + Document doc2 = sb.build(new StringReader(newBibTag)); + Element newBibRoot=doc2.getRootElement(); + + Element bibPar=(Element) bibNode.getParent(); + bibPar.removeContent(bibNode); + bibPar.addContent((Element) newBibRoot.clone()); + + XMLOutputter op = new XMLOutputter(); + newIM = op.outputString(dom); + if (save){ + //TODO save the new XML + } + } + } catch (IllegalStateException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (JDOMException e) { + + System.err.println("Can't handle:"+ idOfObject); + e.printStackTrace(); + + } + return newIM; + } + + public String addContext(String indexMeta, String link, String name, Boolean save) throws IOException { + SAXBuilder sb = new SAXBuilder(); + InputStream im = new ByteArrayInputStream(indexMeta.getBytes("utf-8")); + try { + Document dom =sb.build(im); + + XPath xp = XPath.newInstance("//meta"); + Element metaNode=(Element) xp.selectSingleNode(dom); + + Element contextElement = new Element("context"); + + Element linkElement = new Element("link"); + linkElement.setText(link); + Element nameElement = new Element("name"); + nameElement.setText(name); + contextElement.addContent(linkElement); + contextElement.addContent(nameElement); + + metaNode.addContent(contextElement); + + XMLOutputter op = new XMLOutputter(); + String newIM = op.outputString(dom); + + if (save){ + //TODO save the new XML + } + return newIM; + } catch (JDOMException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return null; + } + } + +} diff -r 90a19cbda471 -r 2267d8c80a99 src/de/mpiwg/itgroup/metadataManager/indexMeta/server/IndexMetaProvider.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/indexMeta/server/IndexMetaProvider.java Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,68 @@ +/** + * + */ +package de.mpiwg.itgroup.metadataManager.indexMeta.server; + 
+import java.io.File; +import java.io.InputStream; + +import org.apache.log4j.Logger; +import org.restlet.data.Form; +import org.restlet.data.MediaType; +import org.restlet.data.Status; +import org.restlet.representation.FileRepresentation; +import org.restlet.representation.InputRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.StringRepresentation; +import org.restlet.resource.Get; +import org.restlet.resource.Options; +import org.restlet.resource.ServerResource; + +/** + * @author dwinter + * + */ +public class IndexMetaProvider extends ServerResource { + + Logger logger = Logger.getRootLogger(); + String basePermanentPath="/Volumes/online_permanent/"; + +/** + * Erlaubt cross scripting bei Aufruf aus Javascript + * @param entity + */ +@Options +public void doOptions(Representation entity) { + Form responseHeaders = (Form) getResponse().getAttributes().get("org.restlet.http.headers"); + if (responseHeaders == null) { + responseHeaders = new Form(); + getResponse().getAttributes().put("org.restlet.http.headers", responseHeaders); + } + responseHeaders.add("Access-Control-Allow-Origin", "*"); + responseHeaders.add("Access-Control-Allow-Methods", "POST,OPTIONS,GET"); + responseHeaders.add("Access-Control-Allow-Headers", "Content-Type"); + responseHeaders.add("Access-Control-Allow-Credentials", "false"); + responseHeaders.add("Access-Control-Max-Age", "60"); +} + +@Get("xml") +public Representation getXML(){ + logger.debug("getIndexMeta"); + String restPath = getRequest().getResourceRef().getRemainingPart(); + + String newpath=restPath.replace("/permanent/", basePermanentPath); + + String indexMetaStr=newpath+"/index.meta"; + + File indexMetaFile = new File(indexMetaStr); + + if (!indexMetaFile.exists()){ + getResponse().setStatus(Status.CLIENT_ERROR_NOT_FOUND); + logger.debug("file not found"); + return new StringRepresentation("Can't find:"+indexMetaStr); + } + + return new FileRepresentation(indexMetaStr, 
MediaType.TEXT_XML); + +} +} diff -r 90a19cbda471 -r 2267d8c80a99 src/de/mpiwg/itgroup/metadataManager/validation/IndexMetaValidator.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/validation/IndexMetaValidator.java Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,92 @@ +package de.mpiwg.itgroup.metadataManager.validation; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; + +import org.apache.log4j.Logger; +import org.jdom.JDOMException; +import org.jdom.input.SAXBuilder; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +public class IndexMetaValidator { + + private Logger logger = Logger.getRootLogger(); + + public static boolean validate(String indexMeta) throws UnsupportedEncodingException{ + IndexMetaValidator iv = new IndexMetaValidator(); + InputStream is = new ByteArrayInputStream(indexMeta.getBytes("utf-8")); + + String schemaUrl="/Users/dwinter/Documents/Projekte/ECHO-eSciDoc-MPDL/escidocMPIWG/MetaDataManager/src/de/mpiwg/itgroup/metadataManager/validation/data/index_meta.xsd"; + + iv.validateSchema(schemaUrl, is); + return false;} + + + + public boolean validateSchema(String SchemaUrl, InputStream xmlDocumentStream) { + try { //Create SAXBuilder object + SAXBuilder saxBuilder = new SAXBuilder( + "org.apache.xerces.parsers.SAXParser", true); + + //Set SAXBuilder parser to be a validating parser + saxBuilder.setValidation(true); + saxBuilder.setFeature( + "http://apache.org/xml/features/validation/schema", true); + saxBuilder.setFeature( + "http://apache.org/xml/features/validation/schema-full-checking",true); + saxBuilder.setProperty( + "http://apache.org/xml/properties/schema/external-noNamespaceSchemaLocation",SchemaUrl); + + //Create a ErrorHandler and set ErrorHandler on parser. 
+ Validator handler = new Validator(); + saxBuilder.setErrorHandler(handler); + //Parse XML Document + saxBuilder.build(xmlDocumentStream); + //Output Validation Errors + if (handler.validationError == true){ + logger.debug("XML Document has Error:" + + handler.validationError + " " + + handler.saxParseException.getMessage()); + return false;} + else{ + logger.debug("XML Document is valid"); + return true;} + + } catch (JDOMException jde) { + logger.debug(jde); + } + + catch (IOException ioe) { + } + return false; + + } + + //Error Handler class + private class Validator extends DefaultHandler { + public boolean validationError = false; + + public SAXParseException saxParseException = null; + + public void error(SAXParseException exception) throws SAXException { + validationError = true; + saxParseException = exception; + } + + public void fatalError(SAXParseException exception) throws SAXException { + validationError = true; + saxParseException = exception; + } + + public void warning(SAXParseException exception) throws SAXException { + } + } + + + +} diff -r 90a19cbda471 -r 2267d8c80a99 src/de/mpiwg/itgroup/metadataManager/validation/data/2011-05-04_index_meta.xsd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/validation/data/2011-05-04_index_meta.xsd Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,1345 @@ + + + + + + Comment describing your root element + + + + + + In this description elements marked “optional” need not be supplied by the provider of the resource and may be absent in all versions of the metadata file. Elements marked “required” must be supplied by the provider of the resource. Elements marked “deduced” can be supplied by the provider of the resource but can also be provided by automatic scripts later in the process, these elements must be present in the final file. File and directory paths in the metadata file use the conventional Unix file separator slash “/”. The outer container element is resource. 
+type is sub-type of resource (e.g. “ECHO”, “MPIWG”) +version is version number of metadata format (currently 1.2) + + + + + + An informal textual description of the resource (At least one description of the resource’s content is required. The description can be an informal description element or a descriptive element (like bib) in a meta container. ) + + + + + The filename of the resource (name of the directory this file is contained in) + + + + + The name of the pro ject or person that created the resource + + + + + The time and date the archive collection was created +– deduced. + + + + + The time and date the archive was written to permanent storage – deduced (must not be set by the user). + + + + + The full path to the resource directory inside the whole archive collection, including the resource directory – deduced. + + + + + The ID for this document in the archive + + + + + + Container for the description of the original resource if this resource is a modified version of another resource + + + + + + + The ID of the original resource + + + + + The full path to the original resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + Container for the description of modified resources if this resource is the source of another resource + + + + + + + The ID of the derived resource + + + + + The full path to the derived resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + Container for the description of another resource when this resource is a linked copy of another resource + + + + + + + The ID of the linked resource + + + + + The full path to the linked resource + + + + + + An informal textual description of the relation of this resource to the linked resource + + + + + + + + Container for the description of another resource if this resource is a part of the other resource. 
It can have a type attribute describing the type of relation. e.g. "manuscript-codex". + + + + + + + The ID of the original resource + + + + + The full path to the original resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + + + The main media type of this resource. +The main media type can be overridden by media-types in subdirectories. +Possible types are: image, text, audio, video and data for other type of data + + + + + + + + + + + + + + Additional metadata information about the resource + + + + + Container for the description of a subdirectory (when there are subdirectories). dir tags should not be nested. Directories at lower levels are identified by their path. + + + + + + The name of the subdirectory + + + + + An informal textual description of the subdirectory + + + + + A text string associated with the directory as original name. (E.g. if the data in this directory came from an external source and had a name that had to be changed according to section 1 but it should be possible to reference the original name.) + + + + + The directory path of this subdirectory relative to the resource’s root directory (excluding the directory itself ). (may be +empty or omitted if the directory is a direct child of the resource’s +root directory). + + + + + Additional metadata information about the directory + + + + + + + + Container for the description of a file – deduced. +file tags should not be nested in dir tags. Files at lower directory levels are identified by their path. + + + + + + The name of the file + + + + + An informal textual description of the file + + + + + A text string associated with the file as original name. (e.g. if this file came from an external source and had a name that had to be changed according to section 1 it is possible to preserve the original name.) + + + + + The directory path of this file relative to the resource's root directory (excluding the file itself). 
(may be empty or omitted if the file is in the resource’s root directory). + + + + + The file’s modification or creation date, whichever is more recent + + + + + The file’s modification date + + + + + The file’s creation date + + + + + The file size – deduced. + + + + + The file’s mime-type + + + + + MD5 checksum of the file content + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All additional metadata elements can have a workflow-state attribute. This attribute reflects the state of the corresponding metadata element. The possible values for the workflow-state attribute are +• preliminary this information is preliminary. It must be checked in further workflow steps. +• inwork +• final +workflow states other than preliminary are part of the workflow handling of the respective projects. +Metadata elements can appear multiple times with different workflow-state-attributes. This enables metadata versioning. + + + + + + The content type of this resource. The content type enables the choice of tools to manipulate and display the resource. There should be a common list of content types. For digital documents (books, manuscripts) this would be “scanned document”, for other image data “scanned images”. The criterion for documents is an ordered succession of image files (pages) and equal image size and resolution throughout the images of a resource. + + + + + The language of a resource (e.g. a text) can be specified with a lang tag. Languages have to be described using the international codes for the representation of names of languages either in two-letter form (ISO 639-1) or in three-letter form (ISO 639-2). The entire catalogue of languages is documented on the page http://www.loc.gov/standards/iso639- 2/englangn.html + + + + + The digital resource identifier for the resource is specified in a dri element. Digital resource identifiers are documented on the page http://pythia.mpiwg-berlin.mpg.de/projects/standards/dri. 
+ + + + + The context of a resource as part of a collection or part of a pro ject can be specified in the context element. The context element can appear multiple times if the resource is part of multiple collections or pro jects. + + + + + + URL to additional context information + + + + + Textual description of pro ject or collection + + + + + description of external sources of canonical meta information +- db attribute to identify different sets of meta data links to the same resource +- object attribute to identify different objects or parts of the same resource + + + + + + textual label for the link + + + + + + URL to an external server to be queried + + + + + + + + + + description of external server for canonical meta information. +- db attribute to identify different sets of meta data links to the same resource + + + + + + textual label for the link + + + + + + URL to an external server to be queried +(the parameter object= with an object id has to be appended to this URL) + + + + + + + + + + + + + + + + + + + Comment describing your root element + + + + + + + Name of the academic department where the thesis was handed in. + + + + + Alternate journal + + + + + + + + + Quire signatures and catchwords + + + + + + Notes on collation and corrections. + + + + + City where the conference was held. + + + + + Name of the conference the proceedings are related to. + + + + + + Copyist + + + + + + the date in its original form as noted on the letter + + + + + end of range of uncertain dating + + + + + + Height and width in cm + + + + + + + + + + + + Institution where the report was produced. + + + + + + + Date of the issue the article is part of. Only in bib type="newspaper-article" + + + + + Name of the journal + + + + + + Number of lines and columns. + + + + + + + Name of the magazine. + + + + + Name of the newspaper the article appeared in. + + + + + + + + + + Height and width of page in cm. + + + + + + + The recipient of the letter. 
+ + + + + Report number + + + + + Description of the script and the ink used. + + + + + Notes on secondary literature related to the manuscript + + + + + + + + + + Name of the university where the thesis was handed in. + + + + + + material of the writing surface (e.g. “non-european paper”, “palm leaf ”,. . . ) + + + + + Height and width of written area in cm. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Specific information for architectural drawings is presented in a doc container +with an additional type attribute giving the type of drawing. All elements inside +the container can appear multiple times. + + + + + + last name and first name of a person, separated by a comma. A further common name for the person can be put infront, separated by a semicolon. + + + + + Name of a place in its common notation. This can be a city or a institution. + + + + + This can be a year (or several years, separated by commas) or a period (1706-1714). Years are noted with four digits. + + + + + Short description of an object or signatures. + + + + + + + + + + + + + + Information on the structure of a document like the division into parts and chapters in the way of a table of contents is presented in a toc container. +The scheme allows multiple logical pages on a single page image as it is often the case with scanned books or manuscripts. The scheme also allows for “loose” numbering schemes with roman, arabic or other page numbers consecutively or mixed and changes in the numbering within the document. +The flexibility comes from the fact that no additional assumptions about the +mapping between logical pages and page images are made in the format. All mapping information is specified by the user. +The logical page numbering or naming that can be presented to the user is +specified in the name tags while the physical numbering of the page images is specified in the index or url tags. 
+ + + + + + describes a single logical page + + + + + + the “name” of the logical page. This can be any string like a page number (arabic, roman, etc.) or a special designation like “Table 5”. + + + + + the digilib index number of the scan image of the page. (The index number for digilib is the index in the alphabetical order of the scan file names.) + + + + + alternatively to the digilib index number the full URL of the scan image of the page can be used. + + + + + + + + + + + + + + Image files representing scanned images can have an img container tag with +information about the scan resolution and the size of the original image. This +information is used by the digilib image viewing tool. +Required is one of three possible sets of tags: + + + + + The width of the original image. +The unit of measure can be contained as parameter unit, the default is meter “m”. The width to be considered is the total width of the scanned area. + + + + + The height of the original image. + + + + + The width of the hi-res scan in pixels. + + + + + The height of the hi-res scan in pixels + + + + + + + The resolution of the hi-res scan in pixels per inch if the +resolutions in width and height are the same + + + + + The width of the hi-res scan in pixels - deduced. + + + + + The height of the hi-res scan in pixels – deduced. + + + + + + + The resolution of the hi-res scan in its width in pixels per +inch + + + + + The resolution of the hi-res scan in its height in pixels per +inch + + + + + The width of the hi-res scan in pixels - deduced. + + + + + The height of the hi-res scan in pixels – deduced. + + + + + + + + + A description of the technology used in the process of producing a digital image. + + + + + + acquisition device (e.g. “flatbed scanner”) + + + + + type and color-depth of the image (e.g. “RGB 24 bit”) + + + + + additional textual information about the production process + + + + + + + + Full text in a XML format should be specified with a content-type “fulltext”. 
+The relation between the full text and optional images of whole pages or parts of pages must be specified in a texttool container. + + + + + + + + + + + + + + the file name of the full text file (path inside document directory) + + + + + the directory name of the directory containig the page image files (path inside document directory) + + + + + the directory name of the directory containig the in-page figure image files (path inside document directory) + + + + + a characteristic part of the URL with which the full text can be retrieved (the form and content of this element is dependent on the specific text retrieval mechanism) + + + + + the file name of an additional XSL transformation file + + + + + the name of the element that indicates page breaks (default “pb”) + + + + + + + + + + + + If the access to a resource is bound to conditions for technical or legal reasons then the conditions can be put in a access-conditions container. Other usage conditions like copyright can also be documented in this container. +The attribution, copyright, and access tags can be repeated with different resource attributes if different conditions apply to different parts of the whole resource. + + + + + + The name or institution this resource should be attributed +to when it’s publicly presented. +The kind of resource this condition applies to can be specified with a resource attribute with the values “original” (the physical object that was scanned), "digital-image" (the scanned images), "text" (the textual transcript). + + + + + + + + + + + + + + + + + + + + + + + + + + + the copyright holder and the copyright conditions. +The kind of resource this condition applies to can be specified with a resource attribute with the values “original” (the physical object that was scanned), “digital-image” (the scanned images), “text” (the textual transcript). 
+ + + + + + the name of the copyright holder + + + + + + a name (free text) + + + + + + + + + + the duration of the copyright term (if known) + + + + + + the type of license if its a standardised license e.g. Creative Commons + + + + + a URL representing the license e.g. http://creativecommons.org/licenses/by/3.0/ + + + + + + + + + + + + + + + + + + + + access restricted to the members of this named group. The method to identify a user belonging to a named group is not +specified in this document. + + + + + conditions of access to this resource. Different access types are specified by a type attribute. The kind of resource this condition applies to can be specified with a resource attribute with the values “digital-image” (the scanned images), or “text” (the textual transcript). + + + + name of the group. + + + + + subnet range defined in truncated-quad (e.g. “141.14”), network-netmask (e.g. “141.14.0.0/255.255.0.0”), or network-range (e.g. “141.14.0.0/16”) notation. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the acquisition source of this resource + + + + + + where this resource came from + + + + + + free-text name of the provider (institution or individual) + + + + + address of the provider + + + + + contact person at the provider (i.e. name and email) + + + + + + id of the provider (internally used) - deduced + + + + + + + + + + + + + Documentary films can be described using a film-acquisition container. +(More information about the digitization step could be added in a digitization tag similar to the recording tag.) + + + + + + + + + + + the place where the film was recorded + + + + + recording device used (e.g. “Sony CP-DV8 Camcorder”) + + + + + format of the recorded film (e.g. “DV 720x524 +25fps interlaced”) + + + + + + + + + + + + + + + + Keywords related to the object/manuscript etc. + + + + + describes a section or chapter of the text. chapter elements can be nested. + + + + + + the title of the chapter or section. 
+ + + + + the beginning of a page range (usually the first page of the chapter). The start element has an optional increment attribute to indicate the number of logical pages on a scan image. (This information is only needed by additional tools that try to generate lists of all page and image numbers.) + + + + + + the “name” of the first page + + + + + the index of the first page + + + + + the URL of the first page + + + + + + + + + the end of a page range (usually the last page of the chapter). + + + + + + the “name” of the last page + + + + + the index of the last page + + + + + the URL of the last page + + + + + + + + alternative (and additional) to start/end page ranges single page elements can be used inside chapter. + + + + + + + + + The author of the book/article/thesis etc. +The author/sender of a letter. +The person(s) doing the recording. + + + + + Title of the book/article/thesis etc. + + + + + The year of publication. +- approximate year or century. + + + + + Title of the serie, if the book appears in a series. + + + + + Volume number, if the book appears in a series. + + + + + Number of pages of the entire book/volume. + + + + + City where the book/journal/thesis etc. was published. +City of the newspaper. + + + + + Name of the publishing company. + + + + + Edition of the book/journal (e.g. third edition) + + + + + Name of the translator + + + + + + Call number in holding library + + + + + Holding library + + + + + Name of the series editor, if the book appears in a series. + + + + + Number of volumes, if the book is published in multiple volumes. + + + + + Name of the book’s editor. + + + + + Number of pages of the article + + + + + Title of the book if bib type=inbook + + + + + Volume number + + + + + The date of publication with attribute which calendar used. If no attribute used, CE is the default. Can also be descriptive. +- normalised date of the letter +- Date of the collation of the codex. 
+- Date of the conference the proceedings are related to +- Date when the article appeared. +- Date when the copyright was issued. +- Date of acquisition +- Date or time span when the film was recorded + + + + + + + + + + + + Number of the issue the article is part of. + + + + + Signature(s) of the manuscript under which a manuscript is known. + + + + + Remarks related to the online publication of the manuscript. +This could be notes about annotations etc. + + + + + Name of the library/place/city/country where the manuscript is currently located. + + + + + Text giving list or range of folios. + + + + + Number of folios/pages of the manuscript. + + + + + Formal description of the text structure (e.g. table of contents). + + + + + Description of binding. + + + + + Notes on ownership of the manuscript. + + + + + Additional notes + + + + + Incipit (beginning of text). +The opening phrase of the letter + + + + + Explicit (end of text). +The closing phrase of the letter + + + + + This could be any kind of description. + + + + + Type of the report. The type of correspondence, e.g. “letter”, “postcard”, “telegram”, +“letter draft” + + + + + Interpretative abstract of the text's content. + + + + + URL to present to the client +- alternatively to the digilib index number the full URL of the scan image of the page can be used. + + + + + + + + + + + + the access condition is only valid before the given date (format: “YYYY/MM/DD”). + + + + + the access condition is only valid after the given date (format: “YYYY/MM/DD”). 
+ + + diff -r 90a19cbda471 -r 2267d8c80a99 src/de/mpiwg/itgroup/metadataManager/validation/data/index_meta.xsd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/validation/data/index_meta.xsd Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,1534 @@ + + + + + + Comment describing your root element + + + + + + In this description elements marked “optional” need not be supplied by the provider of the resource and may be absent in all versions of the metadata file. Elements marked “required” must be supplied by the provider of the resource. Elements marked “deduced” can be supplied by the provider of the resource but can also be provided by automatic scripts later in the process, these elements must be present in the final file. File and directory paths in the metadata file use the conventional Unix file separator slash “/”. The outer container element is resource. +type is sub-type of resource (e.g. “ECHO”, “MPIWG”) +version is version number of metadata format (currently 1.2) + + + + + + An informal textual description of the resource (At least one description of the resource’s content is required. The description can be an informal description element or a descriptive element (like bib) in a meta container. ) + + + + + The filename of the resource (name of the directory this file is contained in) + + + + + The name of the pro ject or person that created the resource + + + + + The time and date the archive collection was created +– deduced. + + + + + The time and date the archive was written to permanent storage – deduced (must not be set by the user). + + + + + The full path to the resource directory inside the whole archive collection, including the resource directory – deduced. 
+ + + + + The ID for this document in the archive + + + + + + Container for the description of the original resource if this resource is a modified version of another resource + + + + + + + The ID of the original resource + + + + + The full path to the original resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + Container for the description of modified resources if this resource is the source of another resource + + + + + + + The ID of the derived resource + + + + + The full path to the derived resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + Container for the description of another resource when this resource is a linked copy of another resource + + + + + + + The ID of the linked resource + + + + + The full path to the linked resource + + + + + + An informal textual description of the relation of this resource to the linked resource + + + + + + + + Container for the description of another resource if this resource is a part of the other resource. It can have a type attribute describing the type of relation. e.g. "manuscript-codex". + + + + + + + The ID of the original resource + + + + + The full path to the original resource + + + + + + An informal textual description of the relation of this resource to the original resource + + + + + + + + + + The main media type of this resource. +The main media type can be overridden by media-types in subdirectories. +Possible types are: image, text, audio, video and data for other type of data + + + + + + + + + + + + + + Additional metadata information about the resource + + + + + Container for the description of a subdirectory (when there are subdirectories). dir tags should not be nested. Directories at lower levels are identified by their path. 
+ + + + + + The name of the subdirectory + + + + + An informal textual description of the subdirectory + + + + + A text string associated with the directory as original name. (E.g. if the data in this directory came from an external source and had a name that had to be changed according to section 1 but it should be possible to reference the original name.) + + + + + The directory path of this subdirectory relative to the resource’s root directory (excluding the directory itself ). (may be +empty or omitted if the directory is a direct child of the resource’s +root directory). + + + + + Additional metadata information about the directory + + + + + + + + Container for the description of a file – deduced. +file tags should not be nested in dir tags. Files at lower directory levels are identified by their path. + + + + + + The name of the file + + + + + An informal textual description of the file + + + + + A text string associated with the file as original name. (e.g. if this file came from an external source and had a name that had to be changed according to section 1 it is possible to preserve the original name.) + + + + + The directory path of this file relative to the resource's root directory (excluding the file itself). (may be empty or omitted if the file is in the resource’s root directory). + + + + + The file’s modification or creation date, whichever is more recent + + + + + The file’s modification date + + + + + The file’s creation date + + + + + The file size – deduced. + + + + + The file’s mime-type + + + + + MD5 checksum of the file content + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All additional metadata elements can have a workflow-state attribute. This attribute reflects the state of the corresponding metadata element. The possible values for the workflow-state attribute are +• preliminary this information is preliminary. It must be checked in further workflow steps. 
+• inwork +• final +workflow states other than preliminary are part of the workflow handling of the respective projects. +Metadata elements can appear multiple times with different workflow-state-attributes. This enables metadata versioning. + + + + + + The content type of this resource. The content type enables the choice of tools to manipulate and display the resource. There should be a common list of content types. For digital documents (books, manuscripts) this would be “scanned document”, for other image data “scanned images”. The criterion for documents is an ordered succession of image files (pages) and equal image size and resolution throughout the images of a resource. + + + + + The language of a resource (e.g. a text) can be specified with a lang tag. Languages have to be described using the international codes for the representation of names of languages either in two-letter form (ISO 639-1) or in three-letter form (ISO 639-2). The entire catalogue of languages is documented on the page http://www.loc.gov/standards/iso639-2/englangn.html + + + + + The digital resource identifier for the resource is specified in a dri element. Digital resource identifiers are documented on the page http://pythia.mpiwg-berlin.mpg.de/projects/standards/dri. + + + + + The context of a resource as part of a collection or part of a project can be specified in the context element. The context element can appear multiple times if the resource is part of multiple collections or projects. 
+ + + + + + URL to additional context information + + + + + Textual description of pro ject or collection + + + + + description of external sources of canonical meta information +- db attribute to identify different sets of meta data links to the same resource +- object attribute to identify different objects or parts of the same resource + + + + + + textual label for the link + + + + + + URL to an external server to be queried + + + + + + + + + + description of external server for canonical meta information. +- db attribute to identify different sets of meta data links to the same resource + + + + + + textual label for the link + + + + + + URL to an external server to be queried +(the parameter object= with an object id has to be appended to this URL) + + + + + + + + + + + + + + + + + + + a published book. + + + + + Bibliographic information is presented in a bib container with a type parameter, giving the type of bibliographic resource. The type field can be repeated as a tag in the container. +The format is based on the ECHO scheme for bibliographic data (cf. content workflow), the MPIWG “Projektbibliografie” and the format of the commonly used program “EndNote”. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Name of the conference the proceedings are related to. + + + + + + + + City where the conference was held. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Name of the journal + + + + + Alternate journal + + + + + + + + + + + + + + + + Name of the magazine. + + + + + + + + + + + + + + + + Name of the newspaper the article appeared in. + + + + + + Date of the issue the article is part of. Only in bib type="newspaper-article" + + + + + + + + + + + + + Name of the academic department where the thesis was handed in. + + + + + Name of the university where the thesis was handed in. + + + + + + + + + + + + + + + + + + Institution where the report was produced. 
+ + + + + + Report number + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Notes on collation and corrections. + + + + + + + + + material of the writing surface (e.g. “non-european paper”, “palm leaf ”,. . . ) + + + + + Height and width of page in cm. + + + + + Height and width of written area in cm. + + + + + Number of lines and columns. + + + + + Quire signatures and catchwords + + + + + Description of the script and the ink used. + + + + + Copyist + + + + + + + + Notes on secondary literature related to the manuscript + + + + + + + + + + + + + + Height and width in cm + + + + + + + + + + + + + + + + The recipient of the letter. + + + + + + + end of range of uncertain dating + + + + + the date in its original form as noted on the letter + + + + + place where the letter was written/sent. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Specific information for architectural drawings is presented in a doc container +with an additional type attribute giving the type of drawing. All elements inside +the container can appear multiple times. + + + + + + last name and first name of a person, separated by a comma. A further common name for the person can be put infront, separated by a semicolon. + + + + + Name of a place in its common notation. This can be a city or a institution. + + + + + This can be a year (or several years, separated by commas) or a period (1706-1714). Years are noted with four digits. + + + + + Short description of an object or signatures. + + + + + + + + + + + + + + Information on the structure of a document like the division into parts and chapters in the way of a table of contents is presented in a toc container. +The scheme allows multiple logical pages on a single page image as it is often the case with scanned books or manuscripts. 
The scheme also allows for “loose” numbering schemes with roman, arabic or other page numbers consecutively or mixed and changes in the numbering within the document. +The flexibility comes from the fact that no additional assumptions about the +mapping between logical pages and page images are made in the format. All mapping information is specified by the user. +The logical page numbering or naming that can be presented to the user is +specified in the name tags while the physical numbering of the page images is specified in the index or url tags. + + + + + + describes a single logical page + + + + + + the “name” of the logical page. This can be any string like a page number (arabic, roman, etc.) or a special designation like “Table 5”. + + + + + the digilib index number of the scan image of the page. (The index number for digilib is the index in the alphabetical order of the scan file names.) + + + + + alternatively to the digilib index number the full URL of the scan image of the page can be used. + + + + + + + + + + + + + + Image files representing scanned images can have an img container tag with +information about the scan resolution and the size of the original image. This +information is used by the digilib image viewing tool. +Required is one of three possible sets of tags: + + + + + The width of the original image. +The unit of measure can be contained as parameter unit, the default is meter “m”. The width to be considered is the total width of the scanned area. + + + + + The height of the original image. + + + + + The width of the hi-res scan in pixels. + + + + + The height of the hi-res scan in pixels + + + + + + + The resolution of the hi-res scan in pixels per inch if the +resolutions in width and height are the same + + + + + The width of the hi-res scan in pixels - deduced. + + + + + The height of the hi-res scan in pixels – deduced. 
+ + + + + + + The resolution of the hi-res scan in its width in pixels per +inch + + + + + The resolution of the hi-res scan in its height in pixels per +inch + + + + + The width of the hi-res scan in pixels - deduced. + + + + + The height of the hi-res scan in pixels – deduced. + + + + + + + + + A description of the technology used in the process of producing a digital image. + + + + + + acquisition device (e.g. “flatbed scanner”) + + + + + type and color-depth of the image (e.g. “RGB 24 bit”) + + + + + additional textual information about the production process + + + + + + + + Full text in an XML format should be specified with a content-type “fulltext”. +The relation between the full text and optional images of whole pages or parts of pages must be specified in a texttool container. + + + + + + + + + + + + + + the file name of the full text file (path inside document directory) + + + + + the directory name of the directory containing the page image files (path inside document directory) + + + + + the directory name of the directory containing the in-page figure image files (path inside document directory) + + + + + a characteristic part of the URL with which the full text can be retrieved (the form and content of this element is dependent on the specific text retrieval mechanism) + + + + + the file name of an additional XSL transformation file + + + + + the name of the element that indicates page breaks (default “pb”) + + + + + + + + + + + + If the access to a resource is bound to conditions for technical or legal reasons then the conditions can be put in an access-conditions container. Other usage conditions like copyright can also be documented in this container. +The attribution, copyright, and access tags can be repeated with different resource attributes if different conditions apply to different parts of the whole resource. + + + + + + The name or institution this resource should be attributed +to when it’s publicly presented. 
+The kind of resource this condition applies to can be specified with a resource attribute with the values “original” (the physical object that was scanned), "digital-image" (the scanned images), "text" (the textual transcript). + + + + + + + + + + + + + + + + + + + + + + + + + + + the copyright holder and the copyright conditions. +The kind of resource this condition applies to can be specified with a resource attribute with the values “original” (the physical object that was scanned), “digital-image” (the scanned images), “text” (the textual transcript). + + + + + + the name of the copyright holder + + + + + + a name (free text) + + + + + + + + + + the duration of the copyright term (if known) + + + + + + the type of license if its a standardised license e.g. Creative Commons + + + + + a URL representing the license e.g. http://creativecommons.org/licenses/by/3.0/ + + + + + + + + + + + + + + + + + + + + access restricted to the members of this named group. The method to identify a user belonging to a named group is not +specified in this document. + + + + + conditions of access to this resource. Different access types are specified by a type attribute. The kind of resource this condition applies to can be specified with a resource attribute with the values “digital-image” (the scanned images), or “text” (the textual transcript). + + + + + name of the group. + + + + + + + + + name of the group. + + + + + + + + + subnet range defined in truncated-quad (e.g. “141.14”), network-netmask (e.g. “141.14.0.0/255.255.0.0”), or network-range (e.g. “141.14.0.0/16”) notation. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the acquisition source of this resource + + + + + + where this resource came from + + + + + + free-text name of the provider (institution or individual) + + + + + address of the provider + + + + + contact person at the provider (i.e. 
name and email) + + + + + + id of the provider (internally used) - deduced + + + + + + + + + + + + + Documentary films can be described using a film-acquisition container. +(More information about the digitization step could be added in a digitization tag similar to the recording tag.) + + + + + + + + + + + the place where the film was recorded + + + + + recording device used (e.g. “Sony CP-DV8 Camcorder”) + + + + + format of the recorded film (e.g. “DV 720x524 +25fps interlaced”) + + + + + + + + + + + + + + + + Keywords related to the object/manuscript etc. + + + + + describes a section or chapter of the text. chapter elements can be nested. + + + + + + the title of the chapter or section. + + + + + the beginning of a page range (usually the first page of the chapter). The start element has an optional increment attribute to indicate the number of logical pages on a scan image. (This information is only needed by additional tools that try to generate lists of all page and image numbers.) + + + + + + the “name” of the first page + + + + + the index of the first page + + + + + the URL of the first page + + + + + + + + + the end of a page range (usually the last page of the chapter). + + + + + + the “name” of the last page + + + + + the index of the last page + + + + + the URL of the last page + + + + + + + + alternative (and additional) to start/end page ranges single page elements can be used inside chapter. + + + + + + + + + The author of the book/article/thesis etc. +The author/sender of a letter. +The person(s) doing the recording. + + + + + Title of the book/article/thesis etc. + + + + + The year of publication. +- approximate year or century. + + + + + Title of the serie, if the book appears in a series. + + + + + Volume number, if the book appears in a series. + + + + + Number of pages of the entire book/volume. + + + + + City where the book/journal/thesis etc. was published. +City of the newspaper. + + + + + Name of the publishing company. 
+ + + + + Edition of the book/journal (e.g. third edition) + + + + + Name of the translator + + + + + + Call number in holding library + + + + + Holding library + + + + + Name of the series editor, if the book appears in a series. + + + + + Number of volumes, if the book is published in multiple volumes. + + + + + Name of the book’s editor. + + + + + Number of pages of the article + + + + + Title of the book if bib type=inbook + + + + + Volume number + + + + + The date of publication with attribute which calendar used. If no attribute used, CE is the default. Can also be descriptive. +- normalised date of the letter +- Date of the collation of the codex. +- Date of the conference the proceedings are related to +- Date when the article appeared. +- Date when the copyright was issued. +- Date of acquisition +- Date or time span when the film was recorded + + + + + + + + + + + + Number of the issue the article is part of. + + + + + Signature(s) of the manuscript under which a manuscript is known. + + + + + Remarks related to the online publication of the manuscript. +This could be notes about annotations etc. + + + + + Name of the library/place/city/country where the manuscript is currently located. + + + + + Text giving list or range of folios. + + + + + Number of folios/pages of the manuscript. + + + + + Formal description of the text structure (e.g. table of contents). + + + + + Description of binding. + + + + + Notes on ownership of the manuscript. + + + + + Additional notes + + + + + Incipit (beginning of text). +The opening phrase of the letter + + + + + Explicit (end of text). +The closing phrase of the letter + + + + + This could be any kind of description. + + + + + Type of the report. The type of correspondence, e.g. “letter”, “postcard”, “telegram”, +“letter draft” + + + + + Interpretative abstract of the text's content. 
+ + + + + URL to present to the client +- alternatively to the digilib index number the full URL of the scan image of the page can be used. + + + + + + + + + + + + the access condition is only valid before the given date (format: “YYYY/MM/DD”). + + + + + the access condition is only valid after the given date (format: “YYYY/MM/DD”). + + + diff -r 90a19cbda471 -r 2267d8c80a99 src/de/mpiwg/itgroup/metadataManager/validation/data/sample-index-meta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/itgroup/metadataManager/validation/data/sample-index-meta.xml Sun Oct 23 21:29:45 2011 +0200 @@ -0,0 +1,435 @@ + + + + + String + String + String + 1967-08-13 + 1967-08-13 + String + String + + String + String + + image + + String + en-us + String + + http://www.altova.com + String + + String + http://www.altova.com + http://www.altova.com + + + String + http://www.altova.com + http://www.altova.com + + + + String + String + 2001 + String + String + String + String + String + String + String + String + String + String + String + String + + + String + String + 2001 +

+ String + + + + String + String + http://www.altova.com + + + Text + + String + String + http://www.altova.com + + + String + String + http://www.altova.com + + String + + + + String + String + String + String + + + String + String + String + + + yes + String + String + String + String + String + String + String + String + String + String + + + + String + http://www.altova.com + String + + + + String + http://www.altova.com + + String + String + String + + http://www.altova.com + + + + String + 1967-08-13 + 1967-08-13 + + + + + String +

String

+ String + http://www.altova.com + String + + String + String + + + + String + String + String + Text + Text + + String + + + + String + String + String + String + + String + en-us + String + + http://www.altova.com + String + + String + http://www.altova.com + http://www.altova.com + + + String + http://www.altova.com + http://www.altova.com + + + + String + String + 2001 + String + String + String + String + String + String + String + String + String + String + String + String + + + String + String + 2001 +

String

+ String + http://www.altova.com + String + + String + String + + + + String + String + String + Text + Text + + String + + + + + String + String + String + String + 1967-08-13 + 1967-08-13 + 1967-08-13 + String + String + String + + String + en-us + String + + http://www.altova.com + String + + String + http://www.altova.com + http://www.altova.com + + + String + http://www.altova.com + http://www.altova.com + + + + String + String + 2001 + String + String + String + String + String + String + String + String + String + String + String + String + + + String + String + 2001 +

String

+ String + http://www.altova.com + String + + String + String + + + + String + String + String + Text + Text + + String + + + + +