view zopePubmanConnector.py @ 24:345dd913f520 default tip

new pubman
author Dirk Wintergrün <dwinter@mpiwg-berlin.mpg.de>
date Fri, 10 Jan 2014 12:43:43 +0100
parents d24a8673d68e
children
line wrap: on
line source

# -*- coding: utf-8 -*-

# Connects Zope to PubMan.


from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
import os.path

from Globals import package_home
import httplib2
import xml.etree.ElementTree as ET
import logging
import time
import unicodedata

# Passed as the `timeout` argument to every httplib2.Http created in this module.
TIMEOUT = 10

# Passed as the first argument to every httplib2.Http created in this module.
cacheFolder = "/var/tmp/.cacheWWW"

# Prefix -> namespace URI map handed to ElementTree find()/findall() calls.
ns = dict(
    escidocMetadataProfile="http://escidoc.mpg.de/metadataprofile/schema/0.1/",
    escidocMetadataRecords="http://www.escidoc.de/schemas/metadatarecords/0.4",
    dc="http://purl.org/dc/elements/1.1/",
    escidocComponents="http://www.escidoc.de/schemas/components/0.8",
    escidocItem="http://www.escidoc.de/schemas/item/0.8",
    srel="http://escidoc.de/core/01/structural-relations/",
)
        
           

def zptFile(self, path, orphaned=False):
    """Load a page template file that ships with this product.

    `path` is joined onto the directory returned by
    package_home(globals()).  Unless `orphaned` is true, the template is
    wrapped with __of__(self) before it is returned.
    """
    template = PageTemplateFile(os.path.join(package_home(globals()), path))
    if orphaned:
        # unusual case: hand back the bare, unwrapped template
        return template
    return template.__of__(self)

class ZopePubmanConnector(SimpleItem):
    
    
    #connectorString="http://pubman.mpiwg-berlin.mpg.de/search/SearchAndExport?"
    
    
    meta_type = "ZopePubmanConnector"

    # Put a 'Main Config' entry (action: changeMain) in front of the
    # options inherited from SimpleItem.
    manage_options = (
        {'label': 'Main Config', 'action': 'changeMain'},
    ) + SimpleItem.manage_options
    
    def __init__(self, id, title, pubmanURL):
        # pubmanURL: URL of a PubMan instance, or of a collection if the
        # default collection should not be used.  The query methods of this
        # class append "cqlQuery=..." directly to it.
        self.id, self.title, self.pubmanURL = id, title, pubmanURL
        
    

    def changeMain(self,pubmanURL=None,title=None,REQUEST=None,RESPONSE=None):
        """Change the main settings, or render the configuration form.

        Without a (non-empty) `pubmanURL` the page template
        zpt/ChangeZopePubmanConnector.zpt is rendered and returned.
        Otherwise the new URL is stored, the title is updated when one was
        passed, and - if `RESPONSE` is given - the browser is redirected to
        'manage_main'.
        """
        if not pubmanURL:
            # nothing submitted yet: show the form
            pt = zptFile(self, 'zpt/ChangeZopePubmanConnector.zpt')
            return pt()

        self.pubmanURL = pubmanURL
        if title is not None:
            # A call that only passes pubmanURL must not replace the stored
            # title with None; an explicitly submitted empty title still
            # clears it.
            self.title = title

        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
        
        
    def getPublications(self,personID,limit=None,publicationType=None):
        """Return the publications PubMan reports for the person `personID`.

        personID        -- identifier matched against
                           escidoc.publication.creator.person.identifier
        limit           -- if true, sent as maximumRecords
        publicationType -- if not None, additionally restricts
                           escidoc.publication.type

        Returns a list of tuples
        (objid, citation, bookID, linksIdentifier, linksLocator):
        citation is the text of the item's bibliographicCitation element
        ("" when the item has none), bookID is the first OTHER identifier
        starting with "MPIWG-Book:" (or None), linksIdentifier is a list of
        URI identifiers and linksLocator a list of (title, href, storage)
        tuples.  Returns [] if the request or the XML parsing fails.
        """
        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)

        # NOTE(review): personID and publicationType are inserted into the
        # URL exactly as given - presumably callers pass URL-safe values;
        # confirm.
        personQuery = "escidoc.publication.creator.person.identifier=%22"+personID+"%22"
        if publicationType is None:
            cn = self.pubmanURL+"cqlQuery="+personQuery+"&"
        else:
            cn = self.pubmanURL+"cqlQuery=%28"+personQuery+"%29"
            cn += "%20and%28%20escidoc.publication.type=%22"+publicationType+"%22%29&"

        cn += "exportFormat=APA&outputFormat=snippet&language=all&sortKeys=escidoc.any-dates&sortOrder=descending"
        if limit:
            cn += "&maximumRecords=%s"%limit

        logging.debug(cn)
        try:
            resp, content = h.request(cn)
            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
            root = ET.fromstring(content)
        except Exception as e:
            logging.error("Error getting and parsing data from PubMan: %s"%e)
            return []

        # items look like: <escidocItem:item objid="escidoc:630782" ...>
        citationxpath = ".//{http://purl.org/dc/terms/}bibliographicCitation"
        objxpath = ".//{http://www.escidoc.de/schemas/item/0.8}item"
        idTermPath = """.//escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/escidocMetadataProfile:publication/dc:identifier"""
        componentsPath = """.//escidocComponents:components[1]"""
        xsiType = "{http://www.w3.org/2001/XMLSchema-instance}type"

        citations = root.findall(objxpath)
        logging.debug(len(citations))

        ret = []
        for citation in citations:
            objId = citation.get('objid')

            # An item without a citation element used to raise
            # AttributeError; report an empty citation instead.
            textTag = citation.find(citationxpath)
            if textTag is not None:
                citationText = textTag.text
            else:
                citationText = ""

            linksIdentifier = []
            linksLocator = []
            bookID = None

            for idterm in citation.findall(idTermPath, ns):
                if idterm.text is None:
                    # empty identifier element: nothing to inspect
                    continue
                idType = idterm.get(xsiType, '')
                idValue = idterm.text.strip()
                if idType in ('eterms:OTHER', 'eidt:OTHER'):
                    # look for the book id
                    logging.debug("zopePubmanConnector: %s"%idterm.text)
                    if idValue.startswith("MPIWG-Book:"):
                        bookID = idValue
                        # NOTE(review): this also stops collecting URI
                        # identifiers that follow the book id - confirm
                        # that this is intended.
                        break
                elif idType in ('eterms:URI', 'eidt:URI'):
                    linksIdentifier.append(idValue)

            # Only the first content element below each matched components
            # container is recorded (find() returns the first match).
            for component in citation.findall(componentsPath, ns):
                cnt = component.find(".//escidocComponents:content", ns)
                if cnt is None:
                    continue
                link = ""
                title = ""
                storage = ""
                # attribute names carry a namespace prefix, hence endswith()
                for name, value in cnt.items():
                    if name.endswith("href"):
                        link = value
                    elif name.endswith("title"):
                        title = value
                    elif name.endswith("storage"):
                        storage = value
                linksLocator.append((title, link, storage))

            ret.append((objId, citationText, bookID, linksIdentifier, linksLocator))

        return ret
    
    
    def search(self,values=None,exact=False,limit=None,contexts=None,resultWithContext=False,sortKeys="escidoc.any-dates"):
        """Search PubMan.

        values            -- dict field -> search value; the known fields
                             are "title", "author" and "any", other fields
                             are skipped.  Values are reduced to ASCII
                             before they are sent.
        exact             -- if false, "*" is appended to every value
        limit             -- if true, sent as maximumRecords
        contexts          -- one context objid or a sequence of them; the
                             result is restricted to these contexts
        resultWithContext -- if true, each result is a tuple
                             (citation, context objid)
        sortKeys          -- sort key sent to PubMan (always descending)

        Returns a dict escidoc objid -> citation snippet (or the tuple
        described above).  Returns {} if the request or parsing fails.
        """
        if values is None:
            values = {}

        fieldToEscidoc = {"title":"escidoc.any-title",
                          "author":"escidoc.publication.any.publication-creator-names",
                          "any":"escidoc.metadata"}

        querys = []
        for field, value in values.items():
            searchField = fieldToEscidoc.get(field)
            if searchField is None:
                logging.debug("search, don't know field: %s"%field)
                continue

            try:
                value = unicodedata.normalize('NFKD', value).encode('ASCII', 'ignore')
            except TypeError:
                # byte string: decode it as UTF-8 first
                value = unicodedata.normalize('NFKD', value.decode('utf-8')).encode('ASCII', 'ignore')
            if value == '':
                continue
            logging.debug("%s=%s"%(field,value))
            if not exact:
                value = value+"*"

            # NOTE(review): the value is placed in the URL without further
            # escaping - presumably callers pass URL-safe text; confirm.
            querys.append("%s=%%22%s%%22"%(searchField,value))

        query = "%20AND%20".join(querys)

        if contexts: # restrict to the given contexts
            try:
                stringTypes = basestring    # Python 2: str and unicode
            except NameError:
                stringTypes = str
            if isinstance(contexts, stringTypes):
                contexts = [contexts]

            ctxquerys = ["escidoc.context.objid=%%22%s%%22"%context for context in contexts]
            ctxquery = "%20OR%20".join(ctxquerys)

            if query != "":
                # the separating blank before AND was missing here
                query = query+"%20AND%20("+ctxquery+")"
            else:
                query = "(%s)"%ctxquery

        # Built by concatenation so that a '%' in pubmanURL or sortKeys
        # cannot break a later %-formatting step.
        cn = self.pubmanURL+"cqlQuery="+query+"&"
        cn += "exportFormat=APA&outputFormat=snippet&language=all&sortKeys="+sortKeys+"&sortOrder=descending"
        if limit:
            cn += "&maximumRecords=%s"%limit

        try:
            h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
            logging.debug("search: "+cn)
            resp, content = h.request(cn)
        except Exception:
            logging.error("Unable to get data from PubMan!")
            return {}

        ET.register_namespace("dcterms", "http://purl.org/dc/terms/")

        try:
            root = ET.fromstring(content)
        except Exception:
            logging.error("Couldn't parse content of:%s"%cn)
            return {}

        # items look like: <escidocItem:item objid="escidoc:630782" ...>
        citationxpath = ".//{http://purl.org/dc/terms/}bibliographicCitation"
        objxpath = ".//{http://www.escidoc.de/schemas/item/0.8}item"
        ctxPath = ".//escidocItem:properties/srel:context"

        ret = {}
        for citation in root.findall(objxpath):
            objId = citation.get('objid')

            # missing elements used to raise AttributeError
            textTag = citation.find(citationxpath)
            if textTag is not None:
                citationText = textTag.text
            else:
                citationText = ""

            if resultWithContext:
                ctx = citation.find(ctxPath, ns)
                if ctx is not None:
                    ctxId = ctx.get('objid')
                else:
                    ctxId = None
                ret[objId] = (citationText, ctxId)
            else:
                ret[objId] = citationText

        return ret
        
       
    def getEntriesFromPubman(self,escidocids):
        # Fetch the extended data of every object in `escidocids` (each
        # must have an `escidocid` attribute) and group the entries by
        # publication type.
        #
        # Returns a dict publication type -> list of entry dicts with the
        # keys 'citation', 'escidocId', 'bookId', 'linksIdentifier' and
        # 'linksLocator'.
        #
        # NOTE(review): this method has no docstring and none is added
        # here, so its __doc__ stays unchanged - Zope's publisher is
        # understood to use docstrings to decide what is URL-callable;
        # confirm before adding one.
        doctypes = {}
        for escidocid in escidocids:
            result = self.getEntryFromPubman(escidocid.escidocid, True)

            # getEntryFromPubman may answer with a shorter failure tuple;
            # unpacking it blindly would raise ValueError.
            if not isinstance(result, tuple) or len(result) != 5:
                logging.error("zopePubmanConnector: no extended data for %s"%escidocid.escidocid)
                continue

            txt, docType, bookID, linksIdentifier, linksLocator = result

            entry = {}
            entry['citation'] = txt
            entry['escidocId'] = escidocid.escidocid
            entry['bookId'] = bookID
            entry['linksIdentifier'] = linksIdentifier
            # was assigned linksIdentifier by mistake
            entry['linksLocator'] = linksLocator
            doctypes.setdefault(docType, []).append(entry)

        return doctypes
      
      
    def getEntryFromPubman(self,escidocid,extendedData=None,withContext=False):
        """Fetch one item from PubMan by its escidoc objid.

        The shape of the result depends on the arguments:

        * `extendedData` is not None (any value, even a false one):
          a 5-tuple (citation, publication type, bookID, linksIdentifier,
          linksLocator).  If the request or parsing fails the tuple is
          ("", "", None, [], []).
        * otherwise, `withContext` true: (citation, context objid)
        * otherwise: the citation text

        Kept from the previous behaviour for the non-extended modes: a
        failed request or a missing citation yields ("", ""), and in every
        mode an item that cannot be found yields
        (escidocid, "", "", "", "").
        """
        extended = extendedData is not None

        escidocid = escidocid.strip()
        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)
        cn = self.pubmanURL+"cqlQuery=escidoc.objid="+escidocid+"&"
        cn += "exportFormat=APA&outputFormat=snippet&language=all&sortKeys=escidoc.any-dates&sortOrder=descending"
        content = None
        try:
            resp, content = h.request(cn)
            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
            logging.debug(cn)
            root = ET.fromstring(content)
        except Exception:
            logging.error("zopePubmanConnector: cannot parse: %s"%content)
            if extended:
                # callers of the extended mode unpack five values
                return "", "", None, [], []
            return "",""

        citationxpath = ".//{http://purl.org/dc/terms/}bibliographicCitation"
        itempath = ".//escidocItem:item"

        item = root.find(itempath,ns) #get item

        if item is None:
            logging.error("pubman connector: cannot find %s"%escidocid)
            return escidocid,"","","",""

        citation = item.find(citationxpath,ns)

        if extended:
            linksIdentifier = []
            linksLocator = []
            bookID = None
            xsiType = "{http://www.w3.org/2001/XMLSchema-instance}type"

            #get identifier
            idTermPath = """.//escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/escidocMetadataProfile:publication/dc:identifier"""
            idterms = item.findall(idTermPath,ns)
            logging.debug("zopePubmanConnector: %s"%idterms)
            for idterm in idterms:
                if idterm.text is None:
                    # empty identifier element: nothing to inspect
                    continue
                idType = idterm.get(xsiType, '')
                idValue = idterm.text.strip()
                if idType in ('eterms:OTHER', 'eidt:OTHER'):
                    # look for the book id
                    logging.debug("zopePubmanConnector: %s"%idterm.text)
                    if idValue.startswith("MPIWG-Book:"):
                        bookID = idValue
                        # NOTE(review): this also stops collecting URI
                        # identifiers that follow the book id - confirm
                        # that this is intended.
                        break
                elif idType in ('eterms:URI', 'eidt:URI'):
                    linksIdentifier.append(idValue)

            #get files and locators
            # Only the first content element below each matched components
            # container is recorded (find() returns the first match).
            componentsPath = """.//escidocComponents:components[1]"""
            for component in item.findall(componentsPath,ns):
                cnt = component.find(".//escidocComponents:content",ns)
                if cnt is None:
                    continue
                link = ""
                title = ""
                storage = ""
                # attribute names carry a namespace prefix, hence endswith()
                for name,value in cnt.items():
                    if name.endswith("href"):
                        link = value
                    elif name.endswith("title"):
                        title = value
                    elif name.endswith("storage"):
                        storage = value
                linksLocator.append((title,link,storage))

            path = ".//escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/escidocMetadataProfile:publication"
            publicationTag = item.find(path,ns)
            # a missing publication element used to raise AttributeError
            if publicationTag is not None:
                publicationType = publicationTag.get('type')
            else:
                publicationType = ""

            if citation is not None:
                citationText = citation.text
            else:
                citationText = ""

            return citationText,publicationType,bookID,linksIdentifier,linksLocator

        if citation is None:
            return "",''

        if withContext:
            ctxPath = ".//escidocItem:properties/srel:context"
            ctx = item.find(ctxPath,ns)
            # a missing context element used to raise AttributeError
            if ctx is not None:
                return citation.text,ctx.get('objid')
            return citation.text,None

        return citation.text
        
    def pubmanConnectorURL(self):
        # Accessor for the configured PubMan URL (set in __init__ and
        # changeMain).
        url = self.pubmanURL
        return url
    
    
    def getPublicationsFromContext(self,context,limit=None,publicationType=None,search=None):
        """Return all publications of the PubMan context `context`.

        context         -- objid matched against escidoc.context.objid
        limit           -- if true, sent as maximumRecords
        publicationType -- if not None, restricts escidoc.publication.type
        search          -- if given and not empty, reduced to ASCII and
                           matched against escidoc.metadata

        Returns a list of dicts, one per item, with the keys:
        "id"        --> objid of the item
        "citation"  --> citation in APA formatting ("" if missing)
        "volume"    --> volume of the first source element ("" if missing)
        "link"      --> download link ("" if there is none or the first
                        visibility found in the item is "private")
        "abstracts" --> dict language -> abstract text
        "authors"   --> [(family name, given name), ...]
        "title"     --> title
        "year"      --> issued

        Returns [] if the request or the XML parsing fails.
        """
        h = httplib2.Http(cacheFolder,timeout=TIMEOUT)

        # NOTE(review): context, publicationType and search are inserted
        # into the URL as given - presumably callers pass URL-safe values;
        # confirm.
        cn = self.pubmanURL+"cqlQuery=(escidoc.context.objid=%22"+context+"%22"
        if publicationType is not None:
            cn += "%20and%20escidoc.publication.type=%22"+publicationType+"%22"

        if search is not None and search != "":
            try:
                search = unicodedata.normalize('NFKD', search).encode('ASCII', 'ignore')
            except TypeError:
                # byte string: decode it as UTF-8 first
                search = unicodedata.normalize('NFKD', search.decode('utf-8')).encode('ASCII', 'ignore')
            cn += "%20and%20escidoc.metadata="+search

        cn += ")&exportFormat=APA&outputFormat=snippet&language=all&sortKeys=escidoc.any-dates&sortOrder=descending"
        if limit:
            cn += "&maximumRecords=%s"%limit

        startTime = time.time()
        try:
            logging.debug("getPublicationsFromContext: getting %s"%cn)
            resp, content = h.request(cn)
            logging.debug("getPublicationsFromContext: got data in %ss"%(time.time()-startTime))

            ET.register_namespace("dcterms", "http://purl.org/dc/terms/")

            root = ET.fromstring(content)
        except Exception as e:
            logging.error("Unable to read and parse data! %s"%e)
            return []

        # items look like: <escidocItem:item objid="escidoc:630782" ...>
        citationxpath = ".//{http://purl.org/dc/terms/}bibliographicCitation"
        abstractpath = ".//{http://purl.org/dc/terms/}abstract"
        issuedpath = ".//{http://purl.org/dc/terms/}issued"

        creatorpath = ".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}creator/{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}person"
        familyNamepath = ".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}family-name"
        givenNamepath = ".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}given-name"

        titlepath = ".//{http://purl.org/dc/elements/1.1/}title"

        objxpath = ".//{http://www.escidoc.de/schemas/item/0.8}item"
        srcpath = ".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}source"
        volumepath = ".//{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}volume"

        linkspath = """.//{http://www.escidoc.de/schemas/components/0.8}component/{http://www.escidoc.de/schemas/components/0.8}content[@storage="internal-managed"]"""
        visibility = """.//{http://www.escidoc.de/schemas/components/0.8}component/{http://www.escidoc.de/schemas/components/0.8}properties/{http://escidoc.de/core/01/properties/}visibility"""

        ret = []
        for citation in root.findall(objxpath):
            objId = citation.get('objid')

            textTag = citation.find(citationxpath)
            if textTag is not None:
                citationText = textTag.text
            else:
                citationText = ""

            # volume (= preprint id), e.g.
            #   <publication:source type="series">
            #       <dc:title>... : Preprint</dc:title>
            #       <escidoc:volume>437</escidoc:volume>
            # A missing source or volume element used to raise
            # AttributeError.
            volume = ""
            source = citation.find(srcpath)
            if source is not None:
                volTag = source.find(volumepath)
                if volTag is not None:
                    volume = volTag.text

            # link to the fulltext, e.g.
            #   <escidocComponents:component objid="escidoc:644183">
            #     <escidocComponents:properties>
            #       <prop:visibility>public</prop:visibility>
            #       ...
            #     </escidocComponents:properties>
            #     <escidocComponents:content xlink:title="P437.PDF"
            #         storage="internal-managed" xlink:href="http://..."/>
            #
            # NOTE(review): the visibility is taken from the first
            # component that has one, the link from the first
            # internal-managed content; these may belong to different
            # components - confirm that this is acceptable.
            vis = citation.find(visibility)
            visText = ""
            if vis is not None:
                visText = vis.text
            logging.debug("getPublicationsFromContext: visibility of %s: %s"%(objId, visText))

            # A typo ('scr' instead of 'src') used to leave the source
            # element in place for private items, so their link was read
            # from the wrong element; private items now get an empty link.
            link = ""
            if visText != "private":
                contentTag = citation.find(linkspath)
                if contentTag is not None:
                    link = contentTag.get("{http://www.w3.org/1999/xlink}href")

            # abstracts, e.g. <dcterms:abstract xml:lang="deu">...</dcterms:abstract>
            abstractTexts = {}
            for abstract in citation.findall(abstractpath):
                lang = abstract.get("{http://www.w3.org/XML/1998/namespace}lang")
                abstractTexts[lang] = abstract.text

            authors = []
            for author in citation.findall(creatorpath):
                # missing name elements used to raise AttributeError
                gnTag = author.find(givenNamepath)
                fnTag = author.find(familyNamepath)
                if gnTag is not None:
                    gn = gnTag.text
                else:
                    gn = ""
                if fnTag is not None:
                    fn = fnTag.text
                else:
                    fn = ""
                authors.append((fn,gn))

            titleTag = citation.find(titlepath)
            if titleTag is not None:
                title = titleTag.text
            else:
                title = ""

            issuedTag = citation.find(issuedpath)
            if issuedTag is not None:
                issued = issuedTag.text
            else:
                issued = ""

            item = {"id":objId,
                    "citation":citationText,
                    "volume":volume,
                    "link":link,
                    "abstracts":abstractTexts,
                    "authors":authors,
                    "title":title,
                    "year":issued}

            ret.append(item)

        logging.debug("getPublicationsFromContext: done in %ss"%(time.time()-startTime))
        return ret
    
    
        
        
def manage_addZopePubmanConnectorForm(self):
    """Render the form zpt/AddZopePubmanConnector.zpt."""
    form = zptFile(self, 'zpt/AddZopePubmanConnector.zpt')
    return form()


def manage_addZopePubmanConnector(self,id,title,pubmanURL,RESPONSE=None):
    """Create a ZopePubmanConnector and store it in `self` under `id`.

    If `RESPONSE` is given, the browser is redirected to 'manage_main'.
    """
    self._setObject(id, ZopePubmanConnector(id, title, pubmanURL))

    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')