Annotation of worldwide-digilib/worldwide-digilib.tex, revision 1.1

1.1     ! casties     1: \documentclass[a4paper]{article}
        !             2: 
        !             3: \usepackage[latin1]{inputenc}
        !             4: \usepackage[T1]{fontenc}
        !             5: \usepackage{ae}
        !             6: 
        !             7: \usepackage{url}
        !             8: %\usepackage{hyperref}
        !             9: 
        !            10: %% for latex2rtf :-(
        !            11: % remember to replace "%" in URL!
        !            12: %\newcommand{\url}[1]{\verb!#1!}
        !            13: %\renewenvironment{footnotesize}{}{}
        !            14: 
        !            15: 
        !            16: \newcommand{\digilib}{\texttt{digilib}}
        !            17: 
        !            18: \title{Draft: World Wide digilib -- Resource Identifier in ECHO}
        !            19: 
        !            20: \author{Robert Casties\thanks{IT-Group, Max Planck Institute for the
        !            21:     History of Science}} 
        !            22: 
        !            23: \date{Version~0.6 of \today}
        !            24: 
        !            25: \begin{document}
        !            26: 
        !            27: \maketitle
        !            28: 
        !            29: \tableofcontents
        !            30: 
        !            31: \section{Digital Resource Identifier DRI}
        !            32: 
        !            33: The \emph{Digital Resource Identifier} is a worldwide unique
        !            34: identifier for a digital resource. The resource may be an electronic
        !            35: text, single or multiple digital images, an audiovisual media file or
        !            36: other type of electronic resource that is accessible over the
        !            37: Internet.
        !            38: 
        !            39: The identifier provides a stable point of reference for digital
        !            40: resources in the Internet. The identifier is therefore independent
        !            41: from the address, implementation and directory layout of the location
        !            42: of the resource. The identifier is unique and constant and it can be
        !            43: used in other documents to reference the resource without the risk of
        !            44: having a broken reference in the future because the address or filename
        !            45: of the resource has changed.
        !            46: 
        !            47: The identifier supports infrastructure for the ``sustainability'' of
        !            48: digital resources to guarantee not only that the identifier always
        !            49: points to the same resource but also that the resource stays available
        !            50: on the Internet. The infrastructure supports backup copies and load
        !            51: balancing mechanisms. The implementation and enduring support of the
        !            52: actual servers and digital resources is in itself mostly an
        !            53: organisational and social challenge that cannot be solved by
        !            54: technological measures alone.
        !            55: 
        !            56: 
        !            57: 
        !            58: \subsection{Structure of the DRI}
        !            59: \label{sec:structure-dri}
        !            60: 
        !            61: The \emph{Digital Resource Identifier} has the following properties:
        !            62: 
        !            63: \begin{itemize}
        !            64: \item Total address space of 70 bit, partitioned into about a million
        !            65:   subspaces of 50 bit for about $10^{15}$ (roughly 1125 trillion)
        !            66:   different resources per subspace.
        !            67: 
        !            68: \item The identifier contains only (uppercase) letters and digits.
        !            69:   
        !            70: \item The identifier is composed of a 4 character \emph{subspace} or
        !            71:   \emph{namespace identifier}, a 10 character \emph{resource
        !            72:     address} and a 1 character checksum, giving a total of 15
        !            73:   characters for the full DRI.
        !            74: \end{itemize}
        !            75: 
        !            76: 
        !            77: 
        !            78: 
        !            79: \subsection{Character set}
        !            80: \label{sec:charset}
        !            81: 
        !            82: The identifier is composed only of letters and digits. Uppercase and
        !            83: lowercase letters are not distinguished. The resulting character set
        !            84: has $26+10=36$ characters. Four characters with ambiguous shapes that
        !            85: might lead to errors are omitted: ``O'' (vs. ``0''), ``I'' (vs. ``1''
        !            86: or ``l''), ``L'' (vs. ``1'' or ``I''), and ``J'' (vs. ``1'' or ``I'').
        !            87: The resulting set of 32 characters can be used to represent 5 bit of
        !            88: information.
        !            89: 
        !            90: \begin{table}[htbp]
        !            91:   \centering
        !            92:   \begin{footnotesize}
        !            93:   \begin{tabular}{cc|cc|cc|cc}
        !            94:     character & value & character & value & character & value &
        !            95:     character & value \\ \hline
        !            96:     0 & 0  & A & 10 & N & 20 & Y & 30 \\
        !            97:     1 & 1  & B & 11 & P & 21 & Z & 31 \\
        !            98:     2 & 2  & C & 12 & Q & 22 \\
        !            99:     3 & 3  & D & 13 & R & 23 \\
        !           100:     4 & 4  & E & 14 & S & 24 \\
        !           101:     5 & 5  & F & 15 & T & 25 \\
        !           102:     6 & 6  & G & 16 & U & 26 \\
        !           103:     7 & 7  & H & 17 & V & 27 \\
        !           104:     8 & 8  & K & 18 & W & 28 \\
        !           105:     9 & 9  & M & 19 & X & 29 \\
        !           106:   \end{tabular}
        !           107:   \end{footnotesize}
        !           108:   \caption{Character set for identifier}
        !           109:   \label{tab:chartable}
        !           110: \end{table}
        !           111: 
        !           112: The 50 bit of the chosen address for the resource is divided into ten
        !           113: pieces of 5 bit. The pieces are each encoded into one character
        !           114: according to the character table in table~\ref{tab:chartable}. The
        !           115: resulting string of 10 characters is called the \emph{resource
        !           116:   address}.
        !           117: 
        !           118: 
        !           119: 
        !           120: 
        !           121: \subsection{Namespaces}
        !           122: \label{sec:namespaces}
        !           123: 
        !           124: The total address space of 70 bit is divided into $2^{20}$ (1048576)
        !           125: subspaces of 50 bit. These subspaces, also called namespaces, can be
        !           126: assigned to institutions that wish to implement their own allocation
        !           127: of resource identifiers for reasons of efficiency and maintenance. All
        !           128: resulting resource identifiers are only valid once they are registered
        !           129: with the central \emph{resource registry}.
        !           130: 
        !           131: Each subspace is identified by a four-character \emph{name
        !           132:   space identifier}. The 10 character \emph{resource address} is
        !           133: prefixed with the \emph{name space identifier}, resulting in a 14
        !           134: character \emph{unique address} for each resource.
        !           135: 
        !           136: Subspaces and their name space identifier are registered by the
        !           137: central resource registry. An institution or project that wishes to
        !           138: implement its own allocation of resource identifiers contacts the
        !           139: resource registry and receives a name space identifier for a currently
        !           140: unused subspace. The subspace is then marked as being used by this
        !           141: institution or project. New resource identifiers in this subspace can
        !           142: only be assigned by the institution or project that owns the subspace.
        !           143: 
        !           144: The central resource registry allocates and registers resource
        !           145: identifiers for institutions, projects and individuals that do not
        !           146: want to maintain their own subspace. Resource identifiers allocated by
        !           147: the central resource registry are in the \texttt{ECHO} namespace.
        !           148: 
        !           149: The namespaces \texttt{0000}, \texttt{TEMP} and \texttt{ECHO} are
        !           150: reserved for use with the central resource registry.
        !           151: 
        !           152: 
        !           153: \subsection{Checksum}
        !           154: \label{sec:checksum}
        !           155: 
        !           156: A checksum of one character (5 bit) is calculated over the 14
        !           157: characters (70 bit) of the \emph{unique address}. The checksumming method is
        !           158: similar to the method used for ISBN (International Standard Book
        !           159: Number). The differences are the number system, which is base-32 for
        !           160: the DRI (ISBN: base-10) and the modulus, which is 31 for the DRI
        !           161: (ISBN: 11).
        !           162: 
        !           163: The checksum number $c$ is calculated from the character values $x_i$ with the formula
        !           164: \begin{displaymath}
        !           165:   c = \Bigl( \sum_{i=1}^{14} i \, x_i \Bigr) \bmod 31
        !           166: \end{displaymath}
        !           167: 
        !           168: The resulting checksum number $c$ is converted to a character
        !           169: according to table~\ref{tab:chartable} and appended to the end of the
        !           170: \emph{unique address} giving the full \emph{Digital Resource
        !           171:   Identifier}.
        !           172: 
        !           173: The DRI is only valid if the checksum calculated over the unique
        !           174: address part of the identifier (the first 14 characters) matches the
        !           175: checksum value (the last character).
        !           176: 
        !           177: 
        !           178: 
        !           179: 
        !           180: \section{Central resource registry}
        !           181: \label{sec:central-registry}
        !           182: 
        !           183: The central resource registry is the keystone in the concept of stable
        !           184: and sustainable digital resource identifiers and references. Resources
        !           185: can be moved and renamed on local servers, duplicated onto other
        !           186: servers and servers can even be shut down (provided the resource has been
        !           187: duplicated) without resources getting lost or breaking links or
        !           188: references to the resource.
        !           189: 
        !           190: The resource registry server acts as a switchboard between the user
        !           191: requests for a resource and local servers providing the resource. URLs
        !           192: and other so-called ``global'' references to a resource via its DRI
        !           193: access the resource registry server that dispatches the request to the
        !           194: local server. In this way only the resource registry server's address
        !           195: has to remain stable.
        !           196: 
        !           197: This places a high burden of availability on the registry server. This
        !           198: challenge can be met on a technical level with standard technology
        !           199: (transparent replication and load balancing) and scaled to higher
        !           200: performance levels when the demand rises. More importantly a durable
        !           201: solution has to be established on the organizational and social level
        !           202: for running the server.
        !           203: 
        !           204: The resource registry maintains the mapping database between the
        !           205: digital resource identifiers and the location of the resources on the
        !           206: local servers. In this way it has a list of all known resource
        !           207: identifiers and ensures that all resource identifiers are unique.
        !           208: 
        !           209: The database on the resource registry server can additionally store a
        !           210: minimal set of meta information on the resources and provide
        !           211: searches in this metadata. One item of this minimal meta information
        !           212: should be a URL to further information on the resource.
        !           213: 
        !           214: The resource registry server provides an HTTP redirect function for
        !           215: transparent HTTP access to resources and optionally other webservice
        !           216: access (XML-RPC, SOAP).
        !           217: 
        !           218: Special client software for accessing resources can harvest and cache
        !           219: DRI mappings from the central registry for short periods to improve
        !           220: performance or to support offline work. 
        !           221: 
        !           222: As mentioned in section~\ref{sec:namespaces} parts of the resource
        !           223: identifier address space can be assigned to institutions or projects
        !           224: to implement their own allocation of resource identifiers. These
        !           225: identifiers are generally valid only after they have been registered
        !           226: with the central resource registry.
        !           227: 
        !           228: The central resource registry remains the only authoritative source of
        !           229: digital resource identifiers and their mapping to local resources.
        !           230: 
        !           231: The resource registry provides interfaces to
        !           232: 
        !           233: \begin{itemize}
        !           234: \item redirect HTTP requests with resource identifiers to local
        !           235:   resource servers
        !           236: 
        !           237: \item query the mapping of resource identifiers using a webservice
        !           238:   interface
        !           239: 
        !           240: \item hand out new resource identifiers and acquire the necessary
        !           241:   mapping information
        !           242: 
        !           243: \item change resource mapping information or resource meta information
        !           244: 
        !           245: \item query the database for meta information
        !           246: 
        !           247: \item upload sets of externally allocated resource identifiers
        !           248: 
        !           249: \item download sets of identifiers or the whole database for caching
        !           250:   purposes.
        !           251: \end{itemize}
        !           252: 
        !           253: 
        !           254: 
        !           255: \subsection{Handling of digital resource identifiers in HTTP
        !           256:   requests}
        !           257: \label{sec:dri-resolution-http}
        !           258: 
        !           259: A global HTTP request usually accesses a digital resource via some
        !           260: kind of display tool (for example \digilib{}) that is able to render a
        !           261: web representation of the resource. While the resource identifier is
        !           262: embedded in the DRI part of the URL, other aspects of the rendering
        !           263: (for example which tool to use) are embedded in other parts of the URL
        !           264: that may be specific to the display tool. Therefore the registry
        !           265: server has to treat URLs differently depending on the display tool.
        !           266: 
        !           267: The handling of HTTP requests has three steps:
        !           268: \begin{enumerate}
        !           269: \item Identification of the DRI in the request string.
        !           270: 
        !           271: \item Lookup of additional information on the handling of the request
        !           272:   based on the DRI.
        !           273: 
        !           274: \item Redirect of the client to the local resource server.
        !           275: \end{enumerate}
        !           276: 
        !           277: The first part of the treatment of the URL is the identification of
        !           278: the DRI in the HTTP request string. Three basic ways of handling the
        !           279: DRI are envisaged:
        !           280: 
        !           281: \begin{itemize}
        !           282: \item The DRI can be embedded as part of the URI path\footnote{The
        !           283:     first part of the URI path, separated by slashes, that is a valid
        !           284:     DRI string.} (\url{http://driserver.echo.eu/dri/ECHO00001A2B3CX}),
        !           285: 
        !           286: \item it can be provided as a special HTTP GET or POST parameter for a
        !           287:   defined environment like \digilib{}\footnote{The environment itself
        !           288:     should be identified by the first parts of the URI path.}
        !           289:   (\url{http://driserver.echo.eu/digilib/digilib.jsp?dri=ECHO00001A2B3CX&pn=5})
        !           290:   or
        !           291:   
        !           292: \item it can be extracted from the request by a generic pattern
        !           293:   matching scheme (this option is computationally most expensive).
        !           294: \end{itemize}
        !           295: 
        !           296: Once the DRI is identified more information about the resource can be
        !           297: looked up in the central resource database. From this point on the
        !           298: redirection of the request can be handled differently depending on the
        !           299: record type information in the database.
        !           300: 
        !           301: An extensible set of URL rewrite rules will be implemented by the
        !           302: server. The type of rule to be used is part of the resource record of
        !           303: the DRI in the central resource registry. The following rules should
        !           304: be part of the first implementation of the registry server:
        !           305: 
        !           306: \begin{description}
        !           307: 
        !           308: \item[redirect] only the host part of the URL is replaced by the local
        !           309:   host name from the resource record.
        !           310: 
        !           311: \item[replace] the full URL is replaced by the local URL from the
        !           312:   resource record.
        !           313: 
        !           314: \item[\digilib{}] the host part of the URL is replaced by the local host
        !           315:   name from the resource record and the remaining part is replaced according
        !           316:   to \digilib{} rules.
        !           317: 
        !           318: \item[rewrite] the host part of the URL is replaced by the local host
        !           319:   name from the resource record and the remaining part is replaced according to
        !           320:   generic substitution rules with wildcard patterns.
        !           321: \end{description}
        !           322: 
        !           323: Other specialized types of rewrite rules can be
        !           324: implemented as extension modules to the resource server.
        !           325: 
        !           326: 
        !           327: 
        !           328: \subsubsection{Redirect and replace type DRI resolution}
        !           329: \label{sec:redirect-type-dri}
        !           330: 
        !           331: When a DRI resource record has a resolution type of ``redirect'', then
        !           332: only the host part of the URL is replaced in the redirected request by
        !           333: the local host given in the resource record. See
        !           334: table~\ref{tab:redirect-resolv}.
        !           335: 
        !           336: \begin{table}[htbp]
        !           337:   \centering
        !           338:   \begin{tabular}{lp{0.7\textwidth}}
        !           339:     incoming request & \url{http://driserver.echo.eu/dri/ECHO00001A2B3CX} \\
        !           340:     \texttt{local\_host} record & \texttt{penelope.unibe.ch} \\
        !           341:     redirect request & \url{http://penelope.unibe.ch/dri/ECHO00001A2B3CX}
        !           342:   \end{tabular}
        !           343:   \caption{redirect type DRI resolution}
        !           344:   \label{tab:redirect-resolv}
        !           345: \end{table}
        !           346: 
        !           347: When a DRI resource record has a resolution type of ``replace'', then
        !           348: the whole URL is replaced in the redirected request by the local URL
        !           349: given in the resource record. See table~\ref{tab:replace-resolv}.
        !           350: 
        !           351: \begin{table}[htbp]
        !           352:   \centering
        !           353:   \begin{tabular}{lp{0.7\textwidth}}
        !           354:     incoming request & \url{http://driserver.echo.eu/dri/ECHO00001A2B3CX} \\
        !           355:     \texttt{local\_url} record & \url{http://penelope.unibe.ch/docuserver/compago/compare.pl?32} \\
        !           356:     redirect request & \url{http://penelope.unibe.ch/docuserver/compago/compare.pl?32}
        !           357:   \end{tabular}
        !           358:   \caption{replace type DRI resolution}
        !           359:   \label{tab:replace-resolv}
        !           360: \end{table}
        !           361: 
        !           362: 
        !           363: 
        !           364: \subsubsection{\digilib{} type DRI resolution}
        !           365: \label{sec:digilib-type-dri}
        !           366: 
        !           367: When a DRI resource record has a resolution type of ``\digilib{}'', then
        !           368: the host part of the URL is replaced by the local host in the resource
        !           369: record and the remaining part is replaced according to \digilib{}
        !           370: parameter format.
        !           371: 
        !           372: In the preferred parameter-style format the DRI is given as the
        !           373: parameter ``dri''. The local URL for the redirect is constructed by
        !           374: replacing the URI path up to the ``?'' with the digilib path from the
        !           375: resource record and adding a local filename as parameter ``fn''. See
        !           376: table~\ref{tab:digilib-resolv}.
        !           377: 
        !           378: \begin{table}[htbp]
        !           379:   \centering
        !           380:   \begin{tabular}{lp{0.7\textwidth}}
        !           381:     incoming request &
        !           382:     \url{http://driserver.echo.eu/digilib/digilib.jsp?dri=ECHO00001A2B3CX&pn=5} \\
        !           383:     \texttt{local\_host} record & \texttt{penelope.unibe.ch} \\
        !           384:     \texttt{digilib\_path} record & \texttt{/docuserver/digitallibrary/digilib.jsp} \\
        !           385:     \texttt{digilib\_file} record & \texttt{public/Beispiele} \\
        !           386:     redirect request &
        !           387:     \url{http://penelope.unibe.ch/docuserver/digitallibrary/digilib.jsp?dri=ECHO00001A2B3CX&fn=public/Beispiele&pn=5} 
        !           388:   \end{tabular}
        !           389:   \caption{digilib type DRI resolution}
        !           390:   \label{tab:digilib-resolv}
        !           391: \end{table}
        !           392: 
        !           393: In the deprecated plus-style format the DRI could be placed as the first
        !           394: part of the parameter path, prefixed with ``dri:''. In the local URL
        !           395: the local pathname is appended to the DRI part.
        !           396: 
        !           397: 
        !           398: \subsubsection{Rewrite type DRI resolution}
        !           399: \label{sec:rewrite-type-dri}
        !           400: 
        !           401: When a DRI resource record has a resolution type of ``rewrite'', then
        !           402: the host part of the URL is replaced by the local host name from the
        !           403: resource record and the remaining part is replaced according to
        !           404: generic substitution rules with wildcard patterns.
        !           405: 
        !           406: 
        !           407: 
        !           408: \subsection{Handling of digital resource identifiers as a web service}
        !           409: \label{sec:handl-dri-web}
        !           410: 
        !           411: The basic function of resolution of a DRI as well as other maintenance
        !           412: functions like the registration of new DRIs or the download of parts
        !           413: or all registered DRI mappings should also be accessible with a web
        !           414: service interface.
        !           415: 
        !           416: Specifications for the web service interface have to be established.
        !           417: 
        !           418: 
        !           419: \section{Resource metadata}
        !           420: \label{sec:resource-metadata}
        !           421: 
        !           422: The set of metadata about a resource that is stored on the resource
        !           423: server is called a \emph{resource record}. Since the requirements of
        !           424: access, structure and amount of metadata for different projects can
        !           425: hardly be generalized, the resource server stores only a minimal set of
        !           426: fields that is sufficient for the basic functions of access to the
        !           427: resource, sustainability of access, and interoperability. More
        !           428: extensive and project specific metadata sets should be stored and
        !           429: maintained on external servers. The optional resource information
        !           430: field can be used to point to external metadata representations.
        !           431: 
        !           432: 
        !           433: \subsection{Basic metadata}
        !           434: \label{sec:basic-metadata}
        !           435: 
        !           436: The amount of metadata is dependent on the type of resource record.
        !           437: Common to all records is the \texttt{dri} field for the resource
        !           438: identifier.  Redirect-type records require an additional
        !           439: \texttt{local\_host} field for the host name of the local host.
        !           440: Replace-type records require a \texttt{local\_url} field for a full
        !           441: URL. Digilib-type records require at least the three fields
        !           442: \texttt{local\_host}, \texttt{digilib\_path}, and
        !           443: \texttt{digilib\_file} and accept an optional field
        !           444: \texttt{digilib\_pageno}. The basic fields can be found in
        !           445: table~\ref{tab:basic-meta}.
        !           446: 
        !           447: \begin{table}[htbp]
        !           448:   \centering
        !           449:   \begin{tabular}{lr|l}
        !           450:     type & field & description \\ \hline
        !           451:     \textbf{redirect} & & \\
        !           452:     & \texttt{record\_type} & type of record (``redirect'') \\
        !           453:     & \texttt{dri} & DRI \\
        !           454:     & \texttt{local\_host} & local host name \\ \hline
        !           455:     \textbf{replace} & & \\
        !           456:     & \texttt{record\_type} & type of record (``replace'') \\
        !           457:     & \texttt{dri} & DRI \\
        !           458:     & \texttt{local\_url} & full local URL \\ \hline
        !           459:     \textbf{digilib} & & \\
        !           460:     & \texttt{record\_type} & type of record (``digilib'') \\
        !           461:     & \texttt{dri} & DRI \\
        !           462:     & \texttt{local\_host} & local digilib server \\
        !           463:     & \texttt{digilib\_path} & URI path of the digilib installation \\
        !           464:     & \texttt{digilib\_file} & digilib path name (parameter fn) \\
        !           465:     & \texttt{digilib\_pageno} & optional page number
        !           466:     (parameter pn)
        !           467:   \end{tabular}
        !           468:   \caption{Basic metadata fields}
        !           469:   \label{tab:basic-meta}
        !           470: \end{table}
        !           471: 
        !           472: The resource server may implement additional fields like owner and
        !           473: group fields for internal management and user access functions.
        !           474: 
        !           475: 
        !           476: \subsection{Alternate server and backup server}
        !           477: \label{sec:redund-serv-back}
        !           478: 
        !           479: The resource server architecture is designed to fulfill high demands
        !           480: on the performance and sustainability of access to the
        !           481: resources. These demands can be met by a loosely coupled network of
        !           482: local servers duplicating content for backup and the transparent
        !           483: sharing of concurrent access to resources for enhanced
        !           484: performance.
        !           485: 
        !           486: Backup server fields give the names and paths of servers that provide
        !           487: copies of the resource. Requests for the resource are diverted to a
        !           488: backup server when the original server becomes unavailable.
        !           489: 
        !           490: Alternate server fields give the names and paths of servers that provide
        !           491: copies of the resource. Requests for a resource are spread among all
        !           492: alternate servers for the same resource according to a load-balancing
        !           493: pattern. The pattern can be a simple round-robin scheme or a more
        !           494: sophisticated scheme based on server performance or the geographical
        !           495: location of client and server.
        !           496: 
        !           497: A resource record can have any number of backup server and alternate
        !           498: server fields. Whether a resource is required to have at least one backup
        !           499: server is a policy decision of the hosting project that is not
        !           500: enforced by the resource server.
        !           501: 
        !           502: 
        !           503: 
        !           504: \subsection{Additional resource information}
        !           505: \label{sec:addt-reso-inform}
        !           506: 
        !           507: The resource server itself carries only minimal metadata on a resource
        !           508: but it provides a basic mechanism to store and access more extensive
        !           509: information on external servers.
        !           510: 
        !           511: Every resource record can have a resource info URL that is stored in
        !           512: the \texttt{info-url} field.
        !           513: 
        !           514: \begin{table}[htbp]
        !           515:   \centering
        !           516:   \begin{tabular}{l|l}
        !           517:     field & description \\ \hline
        !           518:     \texttt{info-url} & URL to external information
        !           519:   \end{tabular}
        !           520:   \caption{External resource information}
        !           521:   \label{tab:extern-reso-inform}
        !           522: \end{table}
        !           523: 
        !           524: The external resource information can be accessed in a standardized
        !           525: way on the resource server where the DRI of the resource is part of
        !           526: the URI path: \url{http://driserver.echo.eu/resinfo/ECHO00001A2B3CX/}.
        !           527: Requests to this URL will be redirected to the URL in the
        !           528: \texttt{info-url} field in the resource record.
        !           529: 
        !           530: 
        !           531: \end{document}

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>