--- storage/meta/meta-format.tex 2003/07/01 17:51:40 1.3 +++ storage/meta/meta-format.tex 2003/07/23 10:35:06 1.4 @@ -7,16 +7,16 @@ %\usepackage{courier} % create in-text links black (with PDF) -\usepackage[colorlinks=true,linkcolor=black]{hyperref} +%\usepackage[colorlinks=true,linkcolor=black]{hyperref} % Format URLs nicely (without PDF) -%\usepackage{url} +\usepackage{url} \title{A simple metadata format for resource bundles} -\author{Robert Casties, Dirk Wintergrün, Christoph Liess} +\author{Robert Casties, Dirk Wintergrün, Hans-Christoph Liess} -\date{V0.2.2 of \today} +\date{V0.3pre2 of \today} \begin{document} @@ -35,6 +35,32 @@ in filenames are only the alphanumeric s File and directory paths in the metadata file use the conventional Unix file separator slash ``/''. + +\section{Metadata files} +\label{sec:metadata-files} + +The metadata information is stored in the XML format documented below +in special files in the resource directory. Two forms of metadata +files are possible: +\begin{itemize} +\item a file named \texttt{index.meta} in a directory. + +\item a file named like the data file it describes with an + additional extension \texttt{.meta}. For example metadata for the + file \texttt{0001.tif} would be in a file \texttt{0001.tif.meta}. +\end{itemize} + +The resource directory must contain an \texttt{index.meta} file with +information about the resource as a whole. Other directories can +contain \texttt{index.meta} files. + +Additional information about single data files that are part of the +resource can either be put in \texttt{file} tags in the +\texttt{index.meta} file or in separate \emph{filename}\texttt{.meta} +files for each data file. Information from the directory level file is +inherited at the file level. + + \section{Resource format} \label{sec:mpiwg-doc} @@ -43,10 +69,10 @@ by the provider of the resource and may the metadata file. Elements marked ``required'' must be supplied by the provider of the resource. 
Elements marked ``deduced'' can be supplied by the provider of the resource but can also be provided by -automatic scripts later in the process, the elements must be present +automatic scripts later in the process, these elements must be present in the final file. -The outer container is named \texttt{resource}. Sub-types (``ECHO'', +The outer container element is \texttt{resource}. Sub-types (``ECHO'', ``MPIWG'') can be specified if necessary with a \texttt{type} parameter. Its sub-elements are: @@ -59,9 +85,12 @@ parameter. Its sub-elements are: \item[creator] The name of the project or person that created the resource -- optional. + +\item[archive-creation-date] The time and date the archive collection + was created -- deduced. -\item[archive-creation-date] The time and date the archive was created - -- deduced. +\item[archive-storage-date] The time and date the archive was written + to permanent storage -- deduced (must not be set by the user). \item[archive-path] The full path to the resource directory inside the whole archive collection -- deduced. @@ -164,17 +193,45 @@ parameter. Its sub-elements are: All elements with \texttt{meta} tags can contain an arbitrary number of additional metadata elements. +\subsection{Language} +\label{sec:lang} + +The language of a resource (e.g. a text) can be specified with a +\texttt{lang} tag. Languages have to be described using the +international codes for the representation of names of languages +either in two-letter form (ISO 639-1) or in three-letter form (ISO +639-2). The entire catalogue of languages is documented on the page + +\url{http://www.loc.gov/standards/iso639-2/englangn.html} + \subsection{DRI} \label{sec:dri} The \emph{digital resource identifier} for the resource is specified -with a \texttt{dri} tag. Digital resource identifiers are documented +in a \texttt{dri} element. Digital resource identifiers are documented on the page \url{http://pythia.mpiwg-berlin.mpg.de/projects/standards/dri}. 
+ +\subsection{Collection context} +\label{sec:collection-context} + +The context of a resource as part of a collection or part of a project can be +specified in the \texttt{context} element: + +\begin{description} +\item[link] URL to additional context information. + +\item[name] Textual description of project or collection. +\end{description} +\noindent Multiple \texttt{link} or \texttt{name} elements are +possible. + + + \subsection{Bibliographic information} \label{sec:bibliographic-data} @@ -182,22 +239,327 @@ Bibliographic information in the format bibliographic data (cf. content workflow) or the MPIWG ``Projektbibliografie'' is presented in a \texttt{bib} container with a \texttt{type} parameter, giving the type of bibliographic resource. -The \texttt{type} field is repeated as a tag in the container. The -tags have the variable ``human-readable'' field names. +The \texttt{type} field can be repeated as a tag in the container. + + +\subsubsection{Book} + +\begin{description} + +\item [bib type="book"] a published book. + + \begin{description} + \item [author] The author of the book. + \item [year] The year of publication. + \item [title] Title of the book. + \item [series-editor] Name of the series editor, if the book appears + in a series. + \item [series-title] Title of the series, if the book appears in a + series. + \item [series-volume] Volume number, if the book appears in a + series. + \item [number-of-pages] Number of pages of the entire book. + \item [city] City where the book was published. + \item [publisher] Name of the publishing company + \item [edition] Edition of the book (e.g. third edition) + \item [number-of-volumes] Number of volumes, if the book is + published in multiple volumes. + \item [translator] Name of the translator. + \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{In Book} + +\begin{description} +\item [bib type="inbook"] an article as part of a book. 
+ + \begin{description} + \item [author] The author of the article. + \item [year] The year of publication. + \item [title] Title of the article. + \item [editor] Name of the book's editor. + \item [book-title] Title of the book. + \item [series-volume] Volume number, if the book appears in a + series. + \item [pages] Number of pages of the article. + \item [city] City where the book was published. + \item [publisher] Name of the publishing company + \item [edition] Edition of the book (e.g. third edition) + \item [series-author] Name of the series editor, if the book appears + in a series. + \item [series-title] Title of the series, if the book appears in a + series. + \item [number-of-volumes] Number of volumes, if the book is + published in multiple volumes. + \item [translator] Name of the translator + \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{Proceedings} + +\begin{description} +\item [bib type="proceedings"] a conference proceedings publication. + + \begin{description} + \item [author] The author of the article. + \item [year] The year of publication. + \item [title] Title of the article. + \item [editor] Name of the book's editor. + \item [conference-name] Name of the conference the proceedings are + related to. + \item [volume] Volume number. + \item [pages] Number of pages of the article. + \item [date] Date of the conference the proceedings are related to. + \item [conference-location] City where the conference was held. + \item [publisher] Name of the publishing company + \item [edition] Edition of the book (e.g. third edition) + \item [series-editor] Name of the series editor, if the book appears + in a series. + \item [series-title] Title of the series, if the book appears in a + series. + \item [number-of-volumes] Number of volumes, if the book is + published as multiple volumes. 
+ \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{Edited Book} + +\begin{description} +\item[bib type="edited-book"] a book that is the edition of another + work. + + \begin{description} + \item [editor] Name of the editor of the book. + \item [year] The year of publication. + \item [title] Title of the book. + \item [series-editor] Name of the editor of the series the book is + part of. + \item [series-title] Title of the series, if the book is part of a + series. + \item [series-volume] Volume number, if the book appears in a series. + \item [number-of-pages] Number of pages of the book. + \item [city] City where the book was published. + \item [publisher] Name of the publishing company + \item [edition] Information about the edition (e.g. ``Repr. of the London ed. 1652'') + \item [number-of-volumes] Number of volumes, if the book is + published as multiple volumes. + \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{Journal Article} + +\begin{description} +\item [bib type="journal-article"] an article in a scientific journal. + \begin{description} + \item [author] The author of the article. + \item [year] The year of publication. + \item [title] Title of the article. + \item [journal] Name of the journal. + \item [volume] Volume number, if the journal appears in a series. + \item [issue] Number of the issue the article is part of. + \item [pages] Number of pages of the article. + \item [alternate-journal] Alternate Journal + \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{Magazine Article} + +\begin{description} +\item [bib type="magazine-article"] an article in a popular magazine. + \begin{description} + \item [author] The author of the article. + \item [year] The year of publication. + \item [title] Title of the article. + \item [magazine] Name of the magazine. + \item [volume] Volume number, if the magazine appears in a series. 
+ \item [issue-number] Number of the issue the article is part of. + \item [pages] Number of pages of the article. + \item [date] Date when the article appeared. + \end{description} +\end{description} + +\subsubsection{Newspaper Article} + +\begin{description} +\item [bib type="newspaper-article"] an article in a newspaper. + \begin{description} + \item [author] The author of the article. + \item [year] The year of publication. + \item [title] Title of the article. + \item [Newspaper] Name of the newspaper the article appeared in. + \item [pages] Number of pages of the article. + \item [issue-date] Date of the issue the article is part of. + \item [city] City of the newspaper. + \end{description} +\end{description} + +\subsubsection{Thesis} + +\begin{description} +\item [bib type="thesis"] a master/doctorate/etc. thesis. + \begin{description} + \item [author] The author of the thesis. + \item [year] The year of publication. + \item [title] Title of the thesis. + \item [academic-department] Name of the academic department where + the thesis was handed in. + \item [number-of-pages] Number of pages of the thesis. + \item [city] City where the thesis was published. + \item [University] Name of the university where the thesis was + handed in. + \item [isbn-issn] + \end{description} +\end{description} + +\subsubsection{Report} + +\begin{description} +\item [bib type="report"] a scientific report. + \begin{description} + \item [author] The author of the report. + \item [year] The year of publication. + \item [title] Title of the report. + \item [pages] Number of pages of the report. + \item [date] Date when the report appeared. + \item [city] City where the report was published. + \item [institution] Institution where the report was produced. + \item [type] Type of report. + \item [report-number] Report number. + \end{description} +\end{description} + +\subsubsection{Generic} + +\begin{description} +\item [bib type="generic"] a generic bibliographic type. 
This type + should only be used in rare cases. + \begin{description} + \item [author] + \item [year] + \item [title] + \item [secondary-author] + \item [secondary-title] + \item [volume] + \item [number] + \item [pages] + \item [date] + \item [place-published] + \item [publisher] + \item [edition] + \item [tertiary-author] + \item [tertiary-title] + \item [number-of-volumes] + \item [type-of-work] + \item [subsidiary-author] + \item [alternate-title] + \item [isbn-issn] + \item [call-number] + \item [label] + \item [keywords] + \item [abstract] + \item [notes] + \item [url] +\end{description} +\end{description} + + +\subsection{Architectural drawings} +\label{sec:doc} + +Specific information for architectural drawings is presented in a +\texttt{doc} container. All elements can appear multiple times. + +\begin{description} +\item [person] last name and first name of a person, separated by a + comma. A further common name for the person can be put in front, + separated by a semicolon. +\item [location] Name of a place in its common notation. This can + be a city or an institution. +\item [date] This can be a year (or several years, separated by commas) or a period + (1706--1714). Years are noted with four digits. +\item [object] Short description of an object or signatures. +\item [keywords] Keywords related to the object. +\end{description} \subsection{Information on the document structure (table of contents)} \label{sec:toc} -Document structure information like a table of contents for a scanned -document is presented in a \texttt{toc} container. The format to be -used has to be further specified. The format could be based on the so -called ``LiSe-XML'' format. For a detailed description and an -exemplary set of TOC information see: +Information on the structure of a document like the division into +parts and chapters in the way of a table of contents is presented in a +\texttt{toc} container. 
+ +The scheme allows multiple logical pages on a single page image +as it is often the case with scanned books or manuscripts. The scheme +also allows for ``loose'' numbering schemes with roman, arabic or +other page numbers consecutively or mixed and changes in the numbering +within the document. + +The flexibility comes from the fact that no additional assumptions +about the mapping between logical pages and page images are made in +the format. All mapping information is specified by the user. + +The logical page numbering or naming that can be presented to the user +is specified in the \texttt{name} tags while the physical numbering of +the page images is specified in the \texttt{index} or \texttt{url} +tags. + +\begin{description} +\item[page] describes a single logical page + \begin{description} + \item[name] the ``name'' of the logical page. This can be any string + like a page number (arabic, roman, etc.) or a special designation + like ``Table 5''. + + \item[index] the \texttt{digilib} index number\footnote{The index + number for digilib is the index in the alphabetical order of the + scan file names.} of the scan image of the page. + + \item[url] alternatively to the \texttt{digilib} index number the + full URL of the scan image of the page can be used. + \end{description} + +\item[chapter] describes a section or chapter of the text. + \texttt{chapter} elements can be nested. + \begin{description} + \item[name] the title of the chapter or section. + + \item[start] the beginning of a page range (usually the first page + of the chapter). The \texttt{start} element has an optional + \texttt{increment} attribute to indicate the number of logical + pages on a scan image.\footnote{This information is only needed by + additional tools that try to generate lists of all page and + image numbers.} + \begin{description} + \item[name] the ``name'' of the first page (see \texttt{page}). + + \item[index] the index of the first page (see \texttt{page}). 
-\url{http://pythia.mpiwg-berlin.mpg.de/toolserver/TS_lise} + \item[url] the URL of the first page (see \texttt{page}). + \end{description} + + \item[end] the end of a page range (usually the last page of the + chapter). + \begin{description} + \item[name] the ``name'' of the last page (see \texttt{page}). -\url{http://pythia.mpiwg-berlin.mpg.de/toolserver/TSlise/lise_downloads/deimel1929.xml} + \item[index] the index of the last page (see \texttt{page}). + + \item[url] the URL of the last page (see \texttt{page}). + \end{description} + + \item[page] as an alternative (or in addition) to + \texttt{start}/\texttt{end} page ranges, single \texttt{page} + elements can be used inside \texttt{chapter}. + \end{description} +\end{description} + +%%\url{http://pythia.mpiwg-berlin.mpg.de/toolserver/TS_lise} \subsection{Information on scanned images} @@ -250,33 +612,32 @@ reasons then the restrictions can be put inside the container has to be further specified. -\section{Sample metadata file for an ECHO resource} - -The following is the sample structure for a scanned document resource. +\section{Sample metadata files for ECHO resources} +The following is a sample metadata structure for a scanned document. \begin{verbatim} - + Fleck, 1980 fleck.1980 University of Bern - ubern/wiss-theorie scanned images echo23a45e2329x + ger Fleck, Ludwik 1980 Entstehung und Entwicklung einer wissenschaftlichen Tatsache - - - - + + + + Frankfurt am Main Suhrkamp - + Wissenschaftstheorie, Fleck, Tatsache @@ -286,12 +647,47 @@ The following is the sample structure fo Scanned images (300dpi) img - - \end{verbatim} +The following is a sample metadata structure for an architectural +drawing. 
+ +\begin{verbatim} + + Bibliotheca Hertziana + scanned images + + 00000271-asl-160-r-full.tif + + + 315 + + echo45a67bc4367d + ita + + Ciolli, Giacomo + Urban VIII; Barberini, Maffeo + Accademia di San Luca + Roma + 1706 + Concorso Clementino + Fontana Pubblica + Brunnen + ASL 160 + + + + http://colosseum.biblhertz.it:8080/Lineamenta/ + 1033478408.39/1035196181.35/1035196204.09/1035394121.83 + + + + + +\end{verbatim} + \end{document} %%% Local Variables: