#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Time-stamp: <2019-09-13 15:17:52 (kthoden)>

"""This module provides eight functions that might be useful for some scripts, for example Filter_3_05_add_basic_xml.py."""

# NOTE: imports are kept function-local, as in the original file, so that
# importing this module does not require lxml or common_functions.

# Base URL below which every resource keeps its index.meta file.
# Alternative repo that was used at some point:
# "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=/"
_INDEX_META_REPO = "http://content.mpiwg-berlin.mpg.de/mpiwg/online/"

# digilib service that lists the files of a directory as XML.
_DIRINFO_URL = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/api/dirInfo-xml.jsp?fn=/%s/%s/"


def _open_url(url):
    """Open *url* and return the file-like response.

    On an I/O error the error is reported through
    common_functions.printlnInERROR and the interpreter exits via sys.exit().
    """
    import sys
    import common_functions
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    try:
        return urlopen(url)
    except IOError as err:
        # Tuple unpacking in the except clause is Python 2 only; the
        # attributes carry the same two values on both major versions.
        detail = err.strerror if err.strerror is not None else err
        common_functions.printlnInERROR("I/O error ({0}): {1}".format(err.errno, detail))
        sys.exit()


def _load_index_meta(echopath):
    """Download and parse the index.meta of *echopath*, return the lxml tree.

    If the response is not well-formed XML the problem is reported and the
    interpreter exits via sys.exit(). The socket is closed in every case.
    """
    import sys
    from lxml import etree
    import common_functions

    fullpath = "%s%s/index.meta" % (_INDEX_META_REPO, echopath)
    usock = _open_url(fullpath)
    try:
        return etree.parse(usock)
    except etree.XMLSyntaxError:
        common_functions.printlnInERROR("Online resource %s not found. Keep in mind that you must specify the whole path, e. g. /permanent/library/UR271U6Y." % fullpath)
        sys.exit()
    finally:
        usock.close()


def _load_json_resource(filename):
    """Load a JSON file from $ECHO_SCRIPTS_DIR/share/texts/resources.

    Raises KeyError if the ECHO_SCRIPTS_DIR environment variable is not set.
    """
    import codecs
    import json
    import os

    path = os.path.join(os.environ["ECHO_SCRIPTS_DIR"], "share", "texts", "resources", filename)
    # The context manager makes sure the handle is closed after loading.
    with codecs.open(path, 'r', 'utf-8') as handle:
        return json.load(handle)


def get_metadata(echopath):
    """Use python's lxml to get the metadata from the index.meta file.

    The index.meta is fetched over http. If the root tag carries
    version="2.0" the tree is handed to parse_index_meta(), in every other
    case to parse_old_index_meta(). The tuple produced by the chosen parser
    is returned unchanged.
    """
    import common_functions

    common_functions.consoleDebug("Trying to get metadata")
    indexmetaTree = _load_index_meta(echopath)

    # first, we need to check the version (and hopefully, the version of
    # index.meta has been written into the root tag)
    version = indexmetaTree.xpath('/resource/@version')
    if version and version[0] == "2.0":
        common_functions.consoleDebug("index.meta 2.0 found")
        return parse_index_meta(indexmetaTree)

    common_functions.consoleDebug("found an older index.meta")
    return parse_old_index_meta(indexmetaTree)
# def get_metadata ends here


def get_gnd(creator):
    """Look up the GND of every creator named in *creator*.

    Several creators are delimited by a semicolon. Right now, the delimiter
    is hardcoded to '; ', but this might not always be the case.

    Returns a list with one (gnd, name) tuple per creator. If a name is not
    a key of gnd.json, the first item is the string "GND not available".
    """
    import common_functions

    gnd = _load_json_resource("gnd.json")

    author_list = []
    for name in creator.split("; "):
        try:
            identifier = gnd[name]
        except KeyError:
            common_functions.consoleWarn("GND of author (%s) is not in database." % name)
            author_list.append(("GND not available", name))
        else:
            # The message used to have a single placeholder for two values,
            # which raised a TypeError as soon as a GND was found.
            message = 'Found GND of author (%s) in the database: %s. If you want to check this entry in the DNB, please click here.' % (name, identifier)
            print(message)
            common_functions.consoleStatus(message)
            author_list.append((identifier, name))

    return author_list
# def get_gnd ends here


def get_lang_list(language):
    """Return a two-item list with both forms of *language* from iso639.json.

    A two-character *language* is used as a key of iso639.json and yields
    [language, iso639[language]]. A *language* of three or more characters
    is searched among the values and yields [key, language]. An empty
    string (or None) yields ["", ""].

    NOTE(review): a one-character *language* returns an empty list, as it
    always did; confirm whether callers can cope with that.

    Raises KeyError for an unknown two-character code and IndexError for an
    unknown longer form.
    """
    import common_functions

    common_functions.consoleDebug("Language 1: %s!" % language)

    # An element without text (e.g. <lang/>) reaches us as None.
    if language is None:
        language = ""

    def find_key(dic, val):
        """The json dictionary should be searchable both ways.
        Thanks to the internet for that solution"""
        return [k for k, v in dic.items() if v == val][0]
    # def find_key ends here

    # use the json file for looking up language codes
    iso639 = _load_json_resource("iso639.json")

    lang_list = []
    if len(language) == 0:
        lang_list.append(language)
        lang_list.append(language)
    elif len(language) == 2:
        lang_list.append(language)
        lang_list.append(iso639[language])
    elif len(language) >= 3:
        lang_list.append(find_key(iso639, language))
        lang_list.append(language)

    return lang_list
# def get_lang_list ends here


def parse_index_meta(indexmetaTree):
    """Function for parsing the 2.0 style of index.metas.

    Returns the tuple
    (lang_list, title, date, author_list, rights, True, alt_title, dri).
    author_list holds one [name, id type, id] list per author; missing
    values are replaced by "no ID type" and "no ID". alt_title is the string
    "empty" if there is no alternate title.

    Raises IndexError if the main title, the date, the access type or the
    mpiwg dri is missing.
    """
    import common_functions

    common_functions.consoleDebug("We are hypermodern. This is index.meta 2.0 with lxml!")
    indexmeta2dot0 = True

    # title. Assume (not good) that the main title is also the first one
    alltitles = indexmetaTree.xpath('/resource/meta/bib/title[not(@type="alternate")]')
    if len(alltitles) > 1:
        common_functions.consoleWarn("There are %d different titles! Using first one." % len(alltitles))
    title = alltitles[0].text

    # alternative titles
    alternative_titles = indexmetaTree.xpath('/resource/meta/bib/title[@type="alternate"]')
    common_functions.consoleDebug("Number of alternate titles: %s" % len(alternative_titles))
    if len(alternative_titles) > 1:
        common_functions.consoleWarn("There are %d different alternate titles! Using first one." % len(alternative_titles))
    alt_title = alternative_titles[0].text if alternative_titles else "empty"

    # author. What if there are several ones?
    allauthors = indexmetaTree.xpath('/resource/meta/bib/person[@role="author"]/name')

    # an item per author, should contain name, type of id and id itself
    author_list = []
    for position, name in enumerate(allauthors, 1):
        # NOTE(review): the n-th name is matched with the n-th author
        # person; this presumes every author person has exactly one name.
        person = '/resource/meta/bib/person[@role="author"][%d]' % position

        id_types = indexmetaTree.xpath(person + '/identifier/@type')
        if id_types:
            id_type = id_types[0]
        else:
            id_type = "no ID type"
            common_functions.consoleDebug("hallo. Author. %s" % name.text)

        identifiers = indexmetaTree.xpath(person + '/identifier')
        if identifiers:
            id_value = identifiers[0].text
        else:
            id_value = "no ID"
            common_functions.consoleDebug("hallo. Author. %s" % name.text)

        author_list.append([name.text, id_type, id_value])

    # language, we should produce a list here, too, containing both 2 and 3
    # (or more) letter codes. The places are tried in this order.
    lang = ""
    lang_elements = (indexmetaTree.xpath('/resource/lang')
                     or indexmetaTree.xpath('/resource/meta/lang'))
    if lang_elements:
        lang = lang_elements[0].text
    else:
        # An attribute query returns the value itself, not an element.
        lang_attributes = indexmetaTree.xpath('/resource/meta/bib/title/@lang')
        if lang_attributes:
            lang = lang_attributes[0]
        else:
            common_functions.consoleWarn('Could not find any language information.')
    lang_list = get_lang_list(lang)

    # date
    date = indexmetaTree.xpath('/resource/meta/bib/date')[0].text
    # rights
    rights = indexmetaTree.xpath('/resource/meta/access-conditions/access/@type')[0]
    # dri identifier MPIWG:WBGMR64C
    dri = indexmetaTree.xpath('/resource/meta/dri[@type="mpiwg"]')[0].text

    # and export this list which can then be inserted anyway you want
    metadata_list = (lang_list, title, date, author_list, rights, indexmeta2dot0, alt_title, dri)
    return metadata_list
# def parse_index_meta ends here


def parse_old_index_meta(indexmetaTree):
    """Function for parsing the 1.2 style of index.metas. Or version 2.0
    without it being stated.

    Returns the tuple
    (lang_list, title, date, author_list, rights, False, "empty", dri),
    where author_list is the result of get_gnd() for the author string.

    Raises IndexError if the title, the author, the year, the access type
    or the mpiwg dri is missing.
    """
    import common_functions

    common_functions.consoleDebug('This seems to be an older index.meta. If not, please put version="2.0" in the root tag.')
    indexmeta2dot0 = False

    # title
    title = indexmetaTree.xpath('/resource/meta/bib/title')[0].text
    # just in case
    alt_title = "empty"
    # author. What if there are several ones?
    author = indexmetaTree.xpath('/resource/meta/bib/author')[0].text

    # language
    # lang should be outside of bib, but this not necessarily so
    meta_lang = indexmetaTree.xpath('/resource/meta/lang')
    if len(meta_lang) == 0:
        common_functions.consoleDebug('No lang tag found in resource/meta.')
    bib_lang = indexmetaTree.xpath('/resource/meta/bib/lang')
    if bib_lang:
        # bib/lang is what this parser has always read, so it stays first.
        lang = bib_lang[0].text
    elif meta_lang:
        # The tag was only tested, never read; use it as the fallback.
        lang = meta_lang[0].text
    else:
        common_functions.consoleWarn('Could not find any language information.')
        lang = ""
    lang_list = get_lang_list(lang)

    # date
    date = indexmetaTree.xpath('/resource/meta/bib/year')[0].text
    # rights
    rights = indexmetaTree.xpath('/resource/meta/access-conditions/access/@type')[0]
    # dri identifier MPIWG:WBGMR64C
    dri = indexmetaTree.xpath('/resource/meta/dri[@type="mpiwg"]')[0].text

    # insert the creator by using this function
    author_list = get_gnd(author)

    # and export this list which can then be inserted anyway you want
    metadata_list = (lang_list, title, date, author_list, rights, indexmeta2dot0, alt_title, dri)
    return metadata_list
# def parse_old_index_meta ends here


def get_pageimg(echopath):
    """We get the names of the image files also via http, so it is not
    necessary to mount Foxridge.

    The return is a list containing two lists. The first list contains the
    ``fsname`` nodes, the second list the corresponding ``index`` nodes of
    the directory listing, exactly as returned by the XPath queries (the
    nodes themselves, not their text).
    """
    from lxml import etree
    import common_functions

    # parse index.meta
    # first, we need the real path to the images, because, it is not
    # necessarily called pageimg
    common_functions.consoleStatus("Trying to get metadata.")
    indexmetaTree = _load_index_meta(echopath)

    images = indexmetaTree.xpath('/resource/meta/texttool/image')
    if len(images) == 0:
        common_functions.consoleWarn('There seems to be no figure tag in the index meta. Using "pageimg" as default. Possibly, the whole "texttool" entry is missing.')
        pageimg_path = "pageimg"
    else:
        pageimg_path = images[0].text
    common_functions.consoleDebug('Path to pageimages: %s.' % pageimg_path)

    pageimg = _DIRINFO_URL % (echopath, pageimg_path)
    common_functions.consoleDebug(pageimg)
    # end of indexmeta parsing

    # now for the real stuff
    # Same as above, parsing the xml. We have some debugging information
    # here, because sometimes communicating with the digilib service fails.
    common_functions.consoleDebug("usock=urllib.urlopen(pageimg)")
    usock = _open_url(pageimg)
    common_functions.consoleStatus("Getting pageimages from %s ..." % echopath)
    try:
        piTree = etree.parse(usock)
    finally:
        usock.close()

    page_list = piTree.xpath('/dir/file/fsname')
    index_list = piTree.xpath('/dir/file/index')
    return [page_list, index_list]
# def get_pageimg ends here


def get_id(echopath):
    """Parses the echopath and returns the identifier.

    The identifier is the last component of the '/'-separated path. If the
    path ends with a slash, the component before that slash is returned.
    An empty path returns an empty string.
    """
    paths = echopath.split("/")
    # len(paths) > 1 guards the empty path, where there is no paths[-2].
    if len(paths[-1]) == 0 and len(paths) > 1:
        return paths[-2]
    return paths[-1]
# def get_id ends here


def get_unknown(echoorig):
    """Opens a subprocess to call WS's Check Unknown characters. We put
    this in, because TextWrangler's output is garbled at the moment.

    The perl script Filter_2_04_check_unknown_characters.pl is called with
    *echoorig* as its only argument. What it writes to stderr is decoded
    from UTF-8, stored in the module-level global ``unknown_uni`` and also
    returned.
    """
    import subprocess
    import common_functions
    global unknown_uni

    common_functions.consoleStatus("Checking text for unknown characters.")

    # executing perl script
    perl_filter = subprocess.Popen(
        ["perl", "Filter_2_04_check_unknown_characters.pl", echoorig],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() reads both pipes until the script has finished
    _stdout, stderr = perl_filter.communicate()

    # Solution!
    # seems to work if we decode this to unicode, solution is from
    # http://farmdev.com/talks/unicode/
    unknown_uni = stderr.decode('utf-8')
    return unknown_uni
# def get_unknown ends here.