Mercurial > hg > ZopePubmanConnector
view zopePubmanConnector.py @ 24:345dd913f520 default tip
new pubman
author | Dirk Wintergrün <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 10 Jan 2014 12:43:43 +0100 |
parents | d24a8673d68e |
children |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""Connects Zope with PubMan.

NOTE: this module is written for Python 2 / Zope 2 (``Globals`` import,
byte-string ``decode`` handling of search terms).
"""

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
import os.path
from Globals import package_home
import httplib2
import xml.etree.ElementTree as ET
import logging
import time
import unicodedata

# Timeout handed to httplib2.Http for every PubMan request.
TIMEOUT=10

# Cache directory handed to httplib2.Http.
cacheFolder ="/var/tmp/.cacheWWW"

# Namespace prefixes used in the ElementTree path expressions below.
ns = {'escidocMetadataProfile':"http://escidoc.mpg.de/metadataprofile/schema/0.1/",
      'escidocMetadataRecords':"http://www.escidoc.de/schemas/metadatarecords/0.4",
      'dc':'http://purl.org/dc/elements/1.1/',
      'escidocComponents':'http://www.escidoc.de/schemas/components/0.8',
      'escidocItem':'http://www.escidoc.de/schemas/item/0.8',
      'srel':'http://escidoc.de/core/01/structural-relations/',
      }

# String types accepted where a single string may stand in for a list.
try:
    _STRING_TYPES = (basestring,)
except NameError:
    _STRING_TYPES = (str,)

_XSI_TYPE = "{http://www.w3.org/2001/XMLSchema-instance}type"
_XLINK_HREF = "{http://www.w3.org/1999/xlink}href"
_XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

# Example of a matched element: <escidocItem:item objid="escidoc:630782" ...>
_ITEM_PATH = ".//{http://www.escidoc.de/schemas/item/0.8}item"
_CITATION_PATH = ".//{http://purl.org/dc/terms/}bibliographicCitation"
_CONTEXT_PATH = ".//escidocItem:properties/srel:context"
_PUBLICATION_PATH = (".//escidocMetadataRecords:md-records"
                     "/escidocMetadataRecords:md-record"
                     "/escidocMetadataProfile:publication")
_ID_TERM_PATH = _PUBLICATION_PATH + "/dc:identifier"
_COMPONENTS_PATH = ".//escidocComponents:components[1]"
_CONTENT_PATH = ".//escidocComponents:content"


def _exportParams(sortKeys="escidoc.any-dates", limit=None):
    """Return the export/sort part of a PubMan query string (no leading '&')."""
    params = ("exportFormat=APA&outputFormat=snippet&language=all&sortKeys="
              + sortKeys + "&sortOrder=descending")
    if limit:
        params += "&maximumRecords=%s" % limit
    return params


def _toAscii(value):
    """Return *value* NFKD-normalized and reduced to an ASCII byte string."""
    try:
        normalized = unicodedata.normalize('NFKD', value)
    except TypeError:
        # Python 2 byte string: normalize() insists on unicode, so decode first.
        normalized = unicodedata.normalize('NFKD', value.decode('utf-8'))
    return normalized.encode('ASCII', 'ignore')


def _elementText(element, default=""):
    """Return ``element.text``, or *default* when the element is missing."""
    if element is None:
        return default
    return element.text


def _extractIdentifiers(item):
    """Return ``(bookID, linksIdentifier)`` read from the item's dc:identifier tags.

    ``bookID`` is the first identifier of type OTHER whose text starts with
    "MPIWG-Book:" (else None); ``linksIdentifier`` lists the texts of the
    identifiers of type URI seen before that book id.
    """
    bookID = None
    linksIdentifier = []
    for idterm in item.findall(_ID_TERM_PATH, ns):
        idType = idterm.get(_XSI_TYPE, '')
        if idType in ('eterms:OTHER', 'eidt:OTHER'):
            # look for the book id
            logging.debug("zopePubmanConnector: %s", idterm.text)
            checkID = (idterm.text or "").strip()
            if checkID.startswith("MPIWG-Book:"):
                bookID = checkID
                break  # as in the original: stop scanning once the book id is found
        elif idType in ('eterms:URI', 'eidt:URI'):
            linksIdentifier.append((idterm.text or "").strip())
    return bookID, linksIdentifier


def _extractLocators(item):
    """Return a list of ``(title, link, storage)`` tuples for the item's files/locators."""
    linksLocator = []
    # The path selects the components container; only the first content
    # element below each matched container is inspected.
    for components in item.findall(_COMPONENTS_PATH, ns):
        cnt = components.find(_CONTENT_PATH, ns)
        if cnt is None:
            continue
        link = ""
        title = ""
        storage = ""
        # Attribute names arrive namespace-qualified, hence the suffix match.
        for name, value in cnt.items():
            if name.endswith("href"):
                link = value
            elif name.endswith("title"):
                title = value
            elif name.endswith("storage"):
                storage = value
        linksLocator.append((title, link, storage))
    return linksLocator


def zptFile(self, path, orphaned=False):
    """Return a page template file from the product.

    The template is wrapped in the acquisition context of *self* unless
    *orphaned* is true.
    """
    pt = PageTemplateFile(os.path.join(package_home(globals()), path))
    if orphaned:
        # unusual case
        return pt
    return pt.__of__(self)


class ZopePubmanConnector(SimpleItem):
    """Zope object that queries a PubMan search-and-export URL."""

    # Example: connectorString="http://pubman.mpiwg-berlin.mpg.de/search/SearchAndExport?"
    meta_type="ZopePubmanConnector"

    manage_options= ({'label':'Main Config','action': 'changeMain'},) + SimpleItem.manage_options

    def __init__(self,id,title,pubmanURL):
        self.id=id
        self.title=title
        # URL of a PubMan instance, or of a collection if the default
        # collection should not be used. Query parameters are appended
        # directly to this string.
        self.pubmanURL=pubmanURL

    def changeMain(self,pubmanURL=None,title=None,REQUEST=None,RESPONSE=None):
        """Change the main settings, or render the settings form.

        With a *pubmanURL* the URL and title are stored and, if a RESPONSE is
        given, the browser is redirected to ``manage_main``. Without one the
        change form is returned.
        """
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the form branch is taken as the ``else`` of ``if pubmanURL``.
        if pubmanURL:
            self.pubmanURL=pubmanURL
            self.title=title
            if RESPONSE is not None:
                RESPONSE.redirect('manage_main')
        else:
            pt=zptFile(self, 'zpt/ChangeZopePubmanConnector.zpt')
            return pt()

    def getPublications(self,personID,limit=None,publicationType=None):
        """Return all publications of the person *personID*.

        Returns a list of tuples
        ``(objid, citation, bookID, linksIdentifier, linksLocator)``;
        an empty list if the request or the XML parsing fails.
        """
        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
        if publicationType is None:
            cn = self.pubmanURL+"cqlQuery=escidoc.publication.creator.person.identifier=%22"+personID+"%22&"
        else:
            cn = self.pubmanURL+"cqlQuery=%28escidoc.publication.creator.person.identifier=%22"+personID+"%22%29"
            cn +="%20and%28%20escidoc.publication.type=%22"+publicationType+"%22%29&"
        cn += _exportParams(limit=limit)

        logging.debug(cn)
        try:
            resp, content = h.request(cn)
            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
            root = ET.fromstring(content)
        except Exception as e:
            logging.error("Error getting and parsing data from PubMan: %s"%e)
            return []

        citations=root.findall(_ITEM_PATH)
        logging.debug(len(citations))

        ret=[]
        for citation in citations:
            objId = citation.get('objid')
            text = citation.find(_CITATION_PATH)
            bookID, linksIdentifier = _extractIdentifiers(citation)
            linksLocator = _extractLocators(citation)
            ret.append((objId,_elementText(text),bookID,linksIdentifier,linksLocator))
        return ret

    def search(self,values=None,exact=False,limit=None,contexts=None,resultWithContext=False,sortKeys="escidoc.any-dates"):
        """Search PubMan.

        @values map field -> value; known fields are "title", "author", "any"
        @exact if false, "*" is appended to every value
        @contexts a context id or a list of context ids to restrict the search to
        @return map escidocId -> citation snippet, or
                escidocId -> (snippet, context id) if resultWithContext is set;
                an empty map if the request or the parsing fails
        """
        fieldToEscidoc={"title":"escidoc.any-title",
                        "author":"escidoc.publication.any.publication-creator-names",
                        "any":"escidoc.metadata"}

        if values is None:
            values = {}

        # NOTE(review): values are reduced to ASCII but not URL-quoted.
        querys = []
        for field, value in values.items():
            searchField = fieldToEscidoc.get(field)
            if searchField is None:
                logging.debug("search, don't know field: %s"%field)
                continue
            value = _toAscii(value)
            if value == '':
                continue
            logging.debug("%s=%s"%(field,value))
            if not exact:
                value=value+"*"
            querys.append("%s=%%22%s%%22"%(searchField,value))

        query="%20AND%20".join(querys)

        if contexts:
            # restrict to the given contexts
            if isinstance(contexts, _STRING_TYPES):
                contexts=[contexts]
            ctxquerys=["escidoc.context.objid=%%22%s%%22"%context for context in contexts]
            ctxquery="%20OR%20".join(ctxquerys)
            if query!="":
                # separate the field query from the boolean operator
                query=query+"%20AND%20("+ctxquery+")"
            else:
                query="(%s)"%ctxquery

        # Concatenate instead of %-formatting the whole URL, so a '%' inside
        # pubmanURL or sortKeys cannot break the request.
        url = self.pubmanURL+"cqlQuery="+query+"&"+_exportParams(sortKeys, limit)

        try:
            h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
            logging.debug("search: "+url)
            resp, content = h.request(url)
        except Exception:
            logging.error("Unable to get data from PubMan!")
            return {}

        ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
        try:
            root = ET.fromstring(content)
        except Exception:
            logging.error("Couldn't parse content of:%s"%url)
            return {}

        ret={}
        for citation in root.findall(_ITEM_PATH):
            objId = citation.get('objid')
            text = _elementText(citation.find(_CITATION_PATH))
            if resultWithContext:
                ctx = citation.find(_CONTEXT_PATH,ns)
                ctxId = ctx.get('objid') if ctx is not None else None
                ret[objId]=(text,ctxId)
            else:
                ret[objId]=text
        return ret

    def getEntriesFromPubman(self,escidocids):
        # Groups the entries for the given objects by publication type.
        # Each element of escidocids must offer an ``escidocid`` attribute.
        # Returns a map type -> list of maps with the keys 'citation',
        # 'escidocId', 'bookId', 'linksIdentifier' and 'linksLocator'.
        #
        # NOTE: documented with comments on purpose. The original method has
        # no docstring, and ZPublisher only publishes objects that carry one,
        # so adding a docstring could change the web exposure.
        doctypes={}
        for escidocid in escidocids:
            txt, pubType, bookID, linksIdentifier, linksLocator = self.getEntryFromPubman(escidocid.escidocid, True)
            entry={'citation': txt,
                   'escidocId': escidocid.escidocid,
                   'bookId': bookID,
                   'linksIdentifier': linksIdentifier,
                   'linksLocator': linksLocator}
            doctypes.setdefault(pubType, []).append(entry)
        return doctypes

    def getEntryFromPubman(self,escidocid,extendedData=None,withContext=False):
        """Get one entry.

        Return shapes:
        - extendedData is not None:
          ``(citation, type, bookID, linksIdentifier, linksLocator)``
        - withContext: ``(citation, context id)``
        - otherwise: the citation text
        If no item is found, ``(escidocid, "", "", "", "")`` is returned.
        If the request/parsing fails or the item has no citation, the result
        is ``("", "")``, or ``("", "", None, [], [])`` in extended mode so
        that callers unpacking five values keep working.
        """
        escidocid=escidocid.strip()
        extended = extendedData is not None
        if extended:
            failure = ("", "", None, [], [])
        else:
            failure = ("", "")

        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
        url = self.pubmanURL+"cqlQuery=escidoc.objid="+escidocid+"&"+_exportParams()

        content = None
        try:
            resp, content = h.request(url)
            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
            logging.debug(url)
            root = ET.fromstring(content)
        except Exception:
            logging.error("zopePubmanConnector: cannot parse: %s"%content)
            return failure

        item = root.find(".//escidocItem:item",ns)
        if item is None:
            logging.error("pubman connector: cannot find %s"%escidocid)
            return escidocid,"","","",""

        citation=item.find(_CITATION_PATH,ns)
        if citation is None:
            return failure

        if extended:
            bookID, linksIdentifier = _extractIdentifiers(item)
            # files and locators
            linksLocator = _extractLocators(item)
            publicationTag = item.find(_PUBLICATION_PATH,ns)
            if publicationTag is not None:
                pubType = publicationTag.get('type')
            else:
                pubType = ""
            return citation.text,pubType,bookID,linksIdentifier,linksLocator

        if withContext:
            ctx = item.find(_CONTEXT_PATH,ns)
            ctxId = ctx.get('objid') if ctx is not None else None
            return citation.text,ctxId

        return citation.text

    def pubmanConnectorURL(self):
        # Returns the configured PubMan URL. Documented with a comment on
        # purpose: the original method has no docstring (see the note in
        # getEntriesFromPubman).
        return self.pubmanURL

    def getPublicationsFromContext(self,context,limit=None,publicationType=None,search=None):
        """Return all publications of the context as a list of maps.

        Each map contains:
        "id" --> escidoc id of the item
        "citation" --> citation in APA formatting
        "volume" --> volume
        "link" --> download link ("" if there is none or the file is private)
        "abstracts" --> map language -> abstract text
        "authors" --> [(FAMILY NAME, GIVEN NAME), ...]
        "title" --> title
        "year" --> issued

        An empty list is returned if the request or the parsing fails.
        """
        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
        cn = self.pubmanURL+"cqlQuery=(escidoc.context.objid=%22"+context+"%22"
        if publicationType is not None:
            cn +="%20and%20escidoc.publication.type=%22"+publicationType+"%22"

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the search filter is applied regardless of publicationType.
        if search is not None and search != "":
            search = _toAscii(search)
            cn+="%20and%20escidoc.metadata="+search

        cn +=")&"+_exportParams(limit=limit)

        startTime = time.time()
        try:
            logging.debug("getPublicationsFromContext: getting %s"%cn)
            resp, content = h.request(cn)
            logging.debug("getPublicationsFromContext: got data in %ss"%(time.time()-startTime))
            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
            root = ET.fromstring(content)
        except Exception as e:
            logging.error("Unable to read and parse data! %s"%e)
            return []

        abstractpath=".//{http://purl.org/dc/terms/}abstract"
        issuedpath=".//{http://purl.org/dc/terms/}issued"
        creatorpath=".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}creator/{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}person"
        familyNamepath=".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}family-name"
        givenNamepath=".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}given-name"
        titlepath=".//{http://purl.org/dc/elements/1.1/}title"
        srcpath=".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}source"
        volumepath=".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}volume"
        linkspath=""".//{http://www.escidoc.de/schemas/components/0.8}component/{http://www.escidoc.de/schemas/components/0.8}content[@storage="internal-managed"]"""
        visibility=""".//{http://www.escidoc.de/schemas/components/0.8}component/{http://www.escidoc.de/schemas/components/0.8}properties/{http://escidoc.de/core/01/properties/}visibility"""

        ret=[]
        for citation in root.findall(_ITEM_PATH):
            objId = citation.get('objid')
            text = citation.find(_CITATION_PATH)

            # Volume (= preprint id), e.g.
            # <publication:source type="series">
            #   <dc:title>Max-Planck-Institut fuer Wissenschaftsgeschichte : Preprint</dc:title>
            #   <escidoc:volume>437</escidoc:volume>
            source = citation.find(srcpath)
            vol = source.find(volumepath) if source is not None else None

            # Link to the full text, e.g.
            # <escidocComponents:component objid="escidoc:644183">
            #   <escidocComponents:properties>
            #     <prop:visibility>public</prop:visibility> ...
            #   </escidocComponents:properties>
            #   <escidocComponents:content xlink:title="P437.PDF" storage="internal-managed"
            #       xlink:href="http://pubman.mpiwg-berlin.mpg.de/pubman/item/escidoc:643686:3/component/escidoc:644183/P437.PDF"/>
            vis = citation.find(visibility)
            visText = ""
            if vis is not None:
                visText = vis.text

            # Private files get no link. The original reset a misspelled
            # variable here and so fell through to the stale <source> element.
            link = ""
            if visText != "private":
                contentTag = citation.find(linkspath)
                if contentTag is not None:
                    link = contentTag.get(_XLINK_HREF)

            # Abstracts, keyed by xml:lang, e.g.
            # <dcterms:abstract xml:lang="deu">...</dcterms:abstract>
            # <dcterms:abstract xml:lang="eng">...</dcterms:abstract>
            abstractTexts={}
            for abstract in citation.findall(abstractpath):
                abstractTexts[abstract.get(_XML_LANG)]=abstract.text

            authors=[]
            for author in citation.findall(creatorpath):
                gn = _elementText(author.find(givenNamepath))
                fn = _elementText(author.find(familyNamepath))
                authors.append((fn,gn))

            item = {"id":objId,
                    "citation":_elementText(text),
                    "volume":_elementText(vol),
                    "link":link,
                    "abstracts":abstractTexts,
                    "authors":authors,
                    "title":_elementText(citation.find(titlepath)),
                    "year":_elementText(citation.find(issuedpath))}
            ret.append(item)

        logging.debug("getPublicationsFromContext: done in %ss"%(time.time()-startTime))
        return ret


def manage_addZopePubmanConnectorForm(self):
    """Return the form for adding a ZopePubmanConnector."""
    pt=zptFile(self, 'zpt/AddZopePubmanConnector.zpt')
    return pt()


def manage_addZopePubmanConnector(self,id,title,pubmanURL,RESPONSE=None):
    """Add a ZopePubmanConnector to *self* and redirect to ``manage_main``."""
    newObj=ZopePubmanConnector(id,title,pubmanURL)
    self._setObject(id,newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')