Changeset 465:224aad394350 in documentViewer
- Timestamp: Jul 29, 2011, 6:36:04 PM (14 years ago)
- Branch: elementtree
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
MpdlXmlTextServer.py
r458 r465 261 261 docinfo['numTextPages'] = np 262 262 if docinfo.get('numPages', 0) == 0: 263 # seems to be text-only 264 docinfo['num TextPages'] = np263 # seems to be text-only - update page count 264 docinfo['numPages'] = np 265 265 pageinfo['end'] = min(pageinfo['end'], np) 266 266 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) -
documentViewer.py
r464 r465 264 264 265 265 return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode) 266 266 267 267 268 security.declareProtected('View','index_html') … … 294 295 if viewMode=="auto": 295 296 if docinfo.get('textURL', None) or docinfo.get('textURLPath', None): 296 #texturl gesetzt und textViewer konfiguriert297 297 viewMode="text_dict" 298 298 else: … … 390 390 def getInfo_xml(self,url,mode): 391 391 """returns info about the document as XML""" 392 393 392 if not self.digilibBaseUrl: 394 393 self.digilibBaseUrl = self.findDigilibUrl() or "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary" … … 398 397 return pt(docinfo=docinfo) 399 398 400 def getOptionToggle(self, newState=None, optionName='text_options_open', initialState=True):401 """returns new option state"""402 if not self.REQUEST.SESSION.has_key(optionName):403 # not in session -- initial404 opt = {'lastState': newState, 'state': initialState}405 else:406 opt = self.REQUEST.SESSION.get(optionName)407 if opt['lastState'] != newState:408 # state in session has changed -- toggle409 opt['state'] = not opt['state']410 opt['lastState'] = newState411 412 self.REQUEST.SESSION[optionName] = opt413 return opt['state']414 415 399 def isAccessible(self, docinfo): 416 400 """returns if access to the resource is granted""" 417 401 access = docinfo.get('accessType', None) 418 402 logging.debug("documentViewer (accessOK) access type %s"%access) 419 if access is not None and access== 'free':403 if access == 'free': 420 404 logging.debug("documentViewer (accessOK) access is free") 421 405 return True 406 422 407 elif access is None or access in self.authgroups: 423 408 # only local access -- only logged in users … … 433 418 return False 434 419 435 436 def getDirinfoFromDigilib(self,path,docinfo=None,cut=0):437 """gibt param von dlInfo aus"""438 if docinfo is None:439 docinfo = {}440 441 for x in range(cut):442 path=getParentPath(path)443 444 infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path445 446 
logging.debug("documentViewer (getparamfromdigilib) dirInfo from %s"%(infoUrl))447 448 txt = getHttpData(infoUrl)449 if txt is None:450 raise IOError("Unable to get dir-info from %s"%(infoUrl))451 452 dom = ET.fromstring(txt)453 #dom = Parse(txt)454 size=getText(dom.find("size"))455 #sizes=dom.xpath("//dir/size")456 logging.debug("documentViewer (getparamfromdigilib) dirInfo:size=%s"%size)457 458 if size:459 docinfo['numPages'] = int(size)460 else:461 docinfo['numPages'] = 0462 463 # TODO: produce and keep list of image names and numbers464 465 return docinfo466 467 def getIndexMetaPath(self,url):468 """gib nur den Pfad zurueck"""469 regexp = re.compile(r".*(experimental|permanent)/(.*)")470 regpath = regexp.match(url)471 if (regpath==None):472 return ""473 logging.debug("(getDomFromIndexMeta): URLXAXA: %s"%regpath.group(2))474 return ("/mpiwg/online/"+regpath.group(1)+"/"+regpath.group(2))475 476 477 478 def getIndexMetaUrl(self,url):479 """returns utr of index.meta document at url"""480 481 metaUrl = None482 if url.startswith("http://"):483 # real URL484 metaUrl = url485 else:486 # online path487 server=self.digilibBaseUrl+"/servlet/Texter?fn="488 metaUrl=server+url.replace("/mpiwg/online","")489 if not metaUrl.endswith("index.meta"):490 metaUrl += "/index.meta"491 492 return metaUrl493 494 def getDomFromIndexMeta(self, url):495 """get dom from index meta"""496 dom = None497 metaUrl = self.getIndexMetaUrl(url)498 499 logging.debug("(getDomFromIndexMeta): METAURL: %s"%metaUrl)500 txt=getHttpData(metaUrl)501 if txt is None:502 raise IOError("Unable to read index meta from %s"%(url))503 504 dom = ET.fromstring(txt)505 #dom = Parse(txt)506 return dom507 508 def getPresentationInfoXML(self, url):509 """returns dom of info.xml document at url"""510 dom = None511 metaUrl = None512 if url.startswith("http://"):513 # real URL514 metaUrl = url515 else:516 # online path517 server=self.digilibBaseUrl+"/servlet/Texter?fn="518 metaUrl=server+url.replace("/mpiwg/online","")519 
520 txt=getHttpData(metaUrl)521 if txt is None:522 raise IOError("Unable to read infoXMLfrom %s"%(url))523 524 dom = ET.fromstring(txt)525 #dom = Parse(txt)526 return dom527 528 529 def getAuthinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):530 """gets authorization info from the index.meta file at path or given by dom"""531 logging.debug("documentViewer (getauthinfofromindexmeta) path: %s"%(path))532 533 access = None534 535 if docinfo is None:536 docinfo = {}537 538 if dom is None:539 for x in range(cut):540 path=getParentPath(path)541 dom = self.getDomFromIndexMeta(path)542 543 acc = dom.find(".//access-conditions/access")544 if acc is not None:545 acctype = acc.get('type')546 #acctype = dom.xpath("//access-conditions/access/@type")547 if acctype:548 access=acctype549 if access in ['group', 'institution']:550 access = dom.find(".//access-conditions/access/name").text.lower()551 552 docinfo['accessType'] = access553 return docinfo554 555 556 def getBibinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):557 """gets bibliographical info from the index.meta file at path or given by dom"""558 logging.debug("documentViewer (getbibinfofromindexmeta) path: %s"%(path))559 560 if docinfo is None:561 docinfo = {}562 563 if dom is None:564 for x in range(cut):565 path=getParentDir(path)566 dom = self.getDomFromIndexMeta(path)567 568 docinfo['indexMetaPath']=self.getIndexMetaPath(path);569 570 logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))571 if self.metadataService is not None:572 # put all raw bib fields in dict "bib"573 bib = self.metadataService.getBibData(dom=dom)574 docinfo['bib'] = bib575 bibtype = bib.get('@type', None)576 docinfo['bib_type'] = bibtype577 # also store DC metadata for convenience578 dc = self.metadataService.getDCMappedData(bib)579 docinfo['creator'] = dc.get('creator',None)580 docinfo['title'] = dc.get('title',None)581 docinfo['date'] = dc.get('date',None)582 else:583 logging.error("MetadataService not 
found!")584 return docinfo585 586 587 # TODO: is this needed?588 def getNameFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):589 """gets name info from the index.meta file at path or given by dom"""590 if docinfo is None:591 docinfo = {}592 593 if dom is None:594 for x in range(cut):595 path=getParentPath(path)596 dom = self.getDomFromIndexMeta(path)597 598 docinfo['name']=getText(dom.find("name"))599 logging.debug("documentViewer docinfo[name] %s"%docinfo['name'])600 return docinfo601 602 603 def getDocinfoFromTextTool(self, url, dom=None, docinfo=None):604 """parse texttool tag in index meta"""605 logging.debug("documentViewer (getdocinfofromtexttool) url: %s" % (url))606 if docinfo is None:607 docinfo = {}608 if docinfo.get('lang', None) is None:609 docinfo['lang'] = '' # default keine Sprache gesetzt610 if dom is None:611 dom = self.getDomFromIndexMeta(url)612 613 texttool = self.metadata.getTexttoolData(dom=dom)614 615 archivePath = None616 archiveName = None617 618 archiveName = getText(dom.find("name"))619 if not archiveName:620 logging.warning("documentViewer (getdocinfofromtexttool) resource/name missing in: %s" % (url))621 622 archivePath = getText(dom.find("archive-path"))623 if archivePath:624 # clean up archive path625 if archivePath[0] != '/':626 archivePath = '/' + archivePath627 if archiveName and (not archivePath.endswith(archiveName)):628 archivePath += "/" + archiveName629 else:630 # try to get archive-path from url631 logging.warning("documentViewer (getdocinfofromtexttool) resource/archive-path missing in: %s" % (url))632 if (not url.startswith('http')):633 archivePath = url.replace('index.meta', '')634 635 if archivePath is None:636 # we balk without archive-path637 raise IOError("Missing archive-path (for text-tool) in %s" % (url))638 639 imageDir = texttool.get('image', None)640 641 if not imageDir:642 # we balk with no image tag / not necessary anymore because textmode is now standard643 #raise IOError("No text-tool info in 
%s"%(url))644 imageDir = ""645 #xquery="//pb"646 docinfo['imagePath'] = "" # keine Bilder647 docinfo['imageURL'] = ""648 649 if imageDir and archivePath:650 #print "image: ", imageDir, " archivepath: ", archivePath651 imageDir = os.path.join(archivePath, imageDir)652 imageDir = imageDir.replace("/mpiwg/online", '')653 docinfo = self.getDirinfoFromDigilib(imageDir, docinfo=docinfo)654 docinfo['imagePath'] = imageDir655 656 docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imageDir657 658 viewerUrl = texttool.get('digiliburlprefix', None)659 if viewerUrl:660 docinfo['viewerURL'] = viewerUrl661 662 # old style text URL663 textUrl = texttool.get('text', None)664 if textUrl:665 if urlparse.urlparse(textUrl)[0] == "": #keine url666 textUrl = os.path.join(archivePath, textUrl)667 # fix URLs starting with /mpiwg/online668 if textUrl.startswith("/mpiwg/online"):669 textUrl = textUrl.replace("/mpiwg/online", '', 1)670 671 docinfo['textURL'] = textUrl672 673 # new style text-url-path674 textUrl = texttool.get('text-url-path', None)675 if textUrl:676 docinfo['textURLPath'] = textUrl677 textUrlkurz = string.split(textUrl, ".")[0]678 docinfo['textURLPathkurz'] = textUrlkurz679 #if not docinfo['imagePath']:680 # text-only, no page images681 #docinfo = self.getNumTextPages(docinfo)682 683 # get bib info684 docinfo = self.getBibinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get info von bib tag685 # TODO: is this needed here?686 docinfo = self.getNameFromIndexMeta(url, docinfo=docinfo, dom=dom)687 688 # TODO: what to do with presentation?689 presentationUrl = texttool.get('presentation', None)690 if presentationUrl: # ueberschreibe diese durch presentation informationen691 # presentation url ergiebt sich ersetzen von index.meta in der url der fuer die Metadaten692 # durch den relativen Pfad auf die presentation infos693 presentationPath = presentationUrl694 if url.endswith("index.meta"):695 presentationUrl = url.replace('index.meta', presentationPath)696 
else:697 presentationUrl = url + "/" + presentationPath698 699 docinfo = self.getBibinfoFromTextToolPresentation(presentationUrl, docinfo=docinfo, dom=dom)700 701 # get authorization702 docinfo = self.getAuthinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get access info703 704 return docinfo705 706 707 def getBibinfoFromTextToolPresentation(self,url,docinfo=None,dom=None):708 """gets the bibliographical information from the preseantion entry in texttools709 """710 dom=self.getPresentationInfoXML(url)711 docinfo['author']=getText(dom.find(".//author"))712 docinfo['title']=getText(dom.find(".//title"))713 docinfo['year']=getText(dom.find(".//date"))714 return docinfo715 716 def getDocinfoFromImagePath(self,path,docinfo=None,cut=0):717 """path ist the path to the images it assumes that the index.meta file is one level higher."""718 logging.debug("documentViewer (getdocinfofromimagepath) path: %s"%(path))719 if docinfo is None:720 docinfo = {}721 path=path.replace("/mpiwg/online","")722 docinfo['imagePath'] = path723 docinfo=self.getDirinfoFromDigilib(path,docinfo=docinfo,cut=cut)724 725 pathorig=path726 for x in range(cut):727 path=getParentPath(path)728 logging.debug("documentViewer (getdocinfofromimagepath) PATH:"+path)729 imageUrl=self.digilibBaseUrl+"/servlet/Scaler?fn="+path730 docinfo['imageURL'] = imageUrl731 732 #TODO: use getDocinfoFromIndexMeta733 #path ist the path to the images it assumes that the index.meta file is one level higher.734 docinfo = self.getBibinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1)735 docinfo = self.getAuthinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1)736 return docinfo737 738 739 def OLDgetDocinfo(self, mode, url):740 """returns docinfo depending on mode"""741 logging.debug("documentViewer (getdocinfo) mode: %s, url: %s"%(mode,url))742 # look for cached docinfo in session743 if self.REQUEST.SESSION.has_key('docinfo'):744 docinfo = self.REQUEST.SESSION['docinfo']745 # check if its still current746 if docinfo is not 
None and docinfo.get('mode') == mode and docinfo.get('url') == url:747 logging.debug("documentViewer (getdocinfo) docinfo in session. keys=%s"%docinfo.keys())748 return docinfo749 750 # new docinfo751 docinfo = {'mode': mode, 'url': url}752 # add self url753 docinfo['viewerUrl'] = self.getDocumentViewerURL()754 if mode=="texttool":755 # index.meta with texttool information756 docinfo = self.getDocinfoFromTextTool(url, docinfo=docinfo)757 elif mode=="imagepath":758 # folder with images, index.meta optional759 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo)760 elif mode=="filepath":761 # filename762 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo,cut=1)763 else:764 logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode)765 raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','filepath'."%(mode))766 767 logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())768 #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)769 # store in session770 self.REQUEST.SESSION['docinfo'] = docinfo771 return docinfo772 420 773 421 … … 830 478 if bib: 831 479 docinfo = self.getDocinfoFromBib(docinfo, bib) 480 else: 481 # no bib - try info.xml 482 docinfo = self.getDocinfoFromPresentationInfoXml(docinfo) 832 483 833 484 # auth info … … 839 490 if mode != 'texttool': 840 491 # override image path from texttool 841 docinfo['imagePath'] = url 492 docinfo['imagePath'] = url.replace('/mpiwg/online/', '', 1) 842 493 843 494 # number of images from digilib … … 870 521 if not docUrl.startswith('http:'): 871 522 docPath = docUrl 872 523 if docPath: 524 # fix URLs starting with /mpiwg/online 525 docPath = docPath.replace('/mpiwg/online', '', 1) 526 873 527 docinfo['documentPath'] = docPath 874 528 return docinfo … … 890 544 if urlparse.urlparse(textUrl)[0] == "": #keine url 891 545 textUrl = os.path.join(docPath, textUrl) 892 # fix URLs starting with /mpiwg/online893 textUrl = 
textUrl.replace('/mpiwg/online', '', 1)894 546 895 547 docinfo['textURL'] = textUrl … … 906 558 presentation = texttool.get('presentation', None) 907 559 if presentation and docPath: 908 docinfo['presentationPath'] = os.path.join(docPath, presentation) 560 if presentation.startswith('http:'): 561 docinfo['presentationUrl'] = presentation 562 else: 563 docinfo['presentationUrl'] = os.path.join(docPath, presentation) 909 564 910 565 return docinfo … … 912 567 def getDocinfoFromBib(self, docinfo, bib): 913 568 """reads contents of bib element into docinfo""" 569 logging.debug("getDocinfoFromBib bib=%s"%repr(bib)) 914 570 # put all raw bib fields in dict "bib" 915 571 docinfo['bib'] = bib … … 926 582 """reads contents of access element into docinfo""" 927 583 #TODO: also read resource type 584 logging.debug("getDocinfoFromAccess acc=%s"%repr(acc)) 928 585 try: 929 acctype = acc c['@attr']['type']586 acctype = acc['@attr']['type'] 930 587 if acctype: 931 588 access=acctype … … 960 617 961 618 619 def getDocinfoFromPresentationInfoXml(self,docinfo): 620 """gets DC-like bibliographical information from the presentation entry in texttools""" 621 url = docinfo.get('presentationUrl', None) 622 if not url: 623 logging.error("getDocinfoFromPresentation: no URL!") 624 return docinfo 625 626 dom = None 627 metaUrl = None 628 if url.startswith("http://"): 629 # real URL 630 metaUrl = url 631 else: 632 # online path 633 634 server=self.digilibBaseUrl+"/servlet/Texter?fn=" 635 metaUrl=server+url 636 637 txt=getHttpData(metaUrl) 638 if txt is None: 639 logging.error("Unable to read info.xml from %s"%(url)) 640 return docinfo 641 642 dom = ET.fromstring(txt) 643 docinfo['creator']=getText(dom.find(".//author")) 644 docinfo['title']=getText(dom.find(".//title")) 645 docinfo['date']=getText(dom.find(".//date")) 646 return docinfo 647 648 962 649 def getPageinfo(self, current, start=None, rows=None, cols=None, docinfo=None, viewMode=None, tocMode=None): 963 650 """returns pageinfo with 
the given parameters"""
Note: See TracChangeset for help on using the changeset viewer.