Mercurial > hg > documentViewer
diff HocrTxtUtils.py @ 617:7aefbddddaf9
alpha of hocr server support
author | dwinter |
---|---|
date | Wed, 23 Jul 2014 17:36:04 +0200 |
parents | |
children |
line wrap: on
line diff
# Diff header of the original changeset view:
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/HocrTxtUtils.py Wed Jul 23 17:36:04 2014 +0200
"""Utility methods for handling XML, reading HTTP, etc"""

from App.ImageFile import ImageFile
from App.Common import rfc1123_date

import sys
import os
import stat
import time
import urllib
import urllib2
import logging


HocrTxtUtilsVersion = "0.1"


def getInt(number, default=0):
    """Return ``number`` converted to int, or ``int(default)`` if that fails.

    Any ordinary conversion failure (wrong type, unparsable string, ...)
    falls back to ``default``.  If ``default`` itself cannot be converted
    the resulting exception propagates to the caller.
    """
    try:
        return int(number)
    except Exception:
        # deliberately broad: the contract is "always an int", but we no
        # longer swallow KeyboardInterrupt/SystemExit like a bare except
        return int(default)


def getAt(array, idx, default=None):
    """Return ``array[idx]``, or ``default`` if the lookup fails.

    Works for anything supporting subscription; a failing lookup
    (index out of range, missing key, ``array`` is None, ...) yields
    ``default`` instead of raising.
    """
    try:
        return array[idx]
    except Exception:
        # best-effort accessor: any ordinary lookup error means "not there"
        return default


def unicodify(s):
    """Decode ``s`` into a unicode object.

    Falsy input returns ``u""``.  Byte strings are decoded as UTF-8 and,
    if that fails, as Latin-1 (which accepts every byte sequence).  Any
    other object is passed through ``unicode()``.
    """
    if not s:
        return u""
    if isinstance(s, str):
        try:
            return s.decode('utf-8')
        except UnicodeDecodeError:
            return s.decode('latin-1')
    return unicode(s)


def utf8ify(s):
    """Encode ``s`` into a UTF-8 byte string.

    Falsy input returns ``""``.  Unicode objects are encoded as UTF-8;
    everything else is passed through ``str()`` (byte strings are assumed
    to be UTF-8 already and are returned unchanged).
    """
    if not s:
        return ""
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return str(s)


def getText(node, recursive=0):
    """Return the text content of an ElementTree ``node`` and its children.

    The result is the node's own text followed, for every direct child,
    by the child's text and tail.  With ``recursive`` set, the text of
    all descendants at any depth is included.  ``None`` yields ``''``.
    """
    if node is None:
        return ''

    parts = [node.text or '']
    for child in node:
        if recursive:
            # pass the flag down so the whole subtree is collected, not
            # just one extra level
            parts.append(getText(child, recursive))
        else:
            parts.append(child.text or '')
        if child.tail:
            parts.append(child.tail)

    return ''.join(parts)


def getHttpData(url, pn=1, data=None, num_tries=3, timeout=10, noExceptions=False):
    """Fetch ``url`` via HTTP GET and return the response body.

    url          -- base URL of the request
    pn           -- value of the ``pn`` query parameter; only added when
                    ``data`` is given
    data         -- extra query parameters: a ready-made query string, or
                    a dict/list/tuple that is urlencoded.  If None the
                    URL is requested unchanged.
    num_tries    -- maximum number of attempts; only URL errors are
                    retried, an HTTP error status ends the loop at once
    timeout      -- timeout in seconds for each attempt
    noExceptions -- if true return None on failure instead of raising

    Raises IOError when no response could be obtained and
    ``noExceptions`` is false.
    """
    # we do GET (by appending data to url)
    if isinstance(data, (str, unicode)):
        # data is already a query string: append as is
        url = "%s?pn=%s&%s" % (url, pn, data)
    elif isinstance(data, (dict, list, tuple)):
        url = "%s?pn=%s&%s" % (url, pn, urllib.urlencode(data))

    timeout_secs = float(timeout)
    response = None
    errmsg = None
    for cnt in range(num_tries):
        try:
            logging.debug("getHttpData(#%s %ss) url=%s", cnt + 1, timeout, url)
            if sys.version_info < (2, 6):
                # urlopen has no timeout parameter before 2.6, so set the
                # process-wide socket default -- ugly :-(
                import socket
                socket.setdefaulttimeout(timeout_secs)
                response = urllib2.urlopen(url)
            else:
                response = urllib2.urlopen(url, timeout=timeout_secs)
            break
        # sys.exc_info() instead of "except X, e" / "except X as e" keeps
        # this module parseable on pre-2.6 and on newer interpreters alike
        except urllib2.HTTPError:
            # the server answered with an error status: retrying won't help
            e = sys.exc_info()[1]
            logging.error("getHttpData: HTTP error(%s): %s", e.code, e)
            errmsg = str(e)
            break
        except urllib2.URLError:
            # connection-level problem: fall through and try again
            e = sys.exc_info()[1]
            logging.error("getHttpData: URLLIB error(%s): %s", e.reason, e)
            errmsg = str(e)

    if response is not None:
        try:
            return response.read()
        finally:
            # release the connection even if reading fails
            response.close()

    if noExceptions:
        return None

    raise IOError("ERROR fetching HTTP data from %s: %s" % (url, errmsg))


def refreshingImageFileIndexHtml(self, REQUEST, RESPONSE):
    """index_html method for App.ImageFile that updates the file info for each request.

    Re-reads size and modification time of ``self.path`` and stores them
    in ``self.size``, ``self.lmt`` and ``self.lmh`` before delegating to
    the original ``ImageFile.index_html``.
    """
    stat_info = os.stat(self.path)
    self.size = stat_info[stat.ST_SIZE]
    # fall back to "now" if the file system reports an mtime of 0
    self.lmt = float(stat_info[stat.ST_MTIME]) or time.time()
    self.lmh = rfc1123_date(self.lmt)
    # call original method
    return ImageFile.index_html(self, REQUEST, RESPONSE)


def getBrowserType(self):
    """Check the browser's request to find out the browser type.

    Reads the ``HTTP_USER_AGENT`` header from ``self.REQUEST`` and
    returns a dict with the keys ``ua`` (the raw header value),
    ``isIE``, ``isN4``, ``isMac``, ``isWin``, ``isIEWin``, ``isIEMac``,
    ``staticHTML`` and, when an MSIE version token could be parsed,
    ``versIE``.  A missing header is treated as an empty user agent.
    """
    ua = self.REQUEST.get_header("HTTP_USER_AGENT")
    # keep the raw value in the result but match against a real string
    agent = ua or ''

    is_ie = 'MSIE' in agent
    bt = {
        'ua': ua,
        'isIE': is_ie,
        'isN4': (not is_ie) and ('Mozilla/4.' in agent),
    }

    try:
        # e.g. "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)":
        # second "; "-separated token after the "(" is "MSIE <version>"
        nav = agent[agent.find('('):]
        ie = nav.split("; ")[1]
        if 'MSIE' in ie:
            bt['versIE'] = ie.split(" ")[1]
    except IndexError:
        # user agent without the expected token layout: no version info
        pass

    bt['isMac'] = 'Macintosh' in agent
    bt['isWin'] = 'Windows' in agent
    bt['isIEWin'] = bt['isIE'] and bt['isWin']
    bt['isIEMac'] = bt['isIE'] and bt['isMac']
    bt['staticHTML'] = False

    return bt