diff HocrTxtUtils.py @ 617:7aefbddddaf9

alpha of hocr server support
author dwinter
date Wed, 23 Jul 2014 17:36:04 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/HocrTxtUtils.py	Wed Jul 23 17:36:04 2014 +0200
@@ -0,0 +1,163 @@
+"""Utility methods for handling XML, reading HTTP, etc"""
+
+from App.ImageFile import ImageFile
+from App.Common import rfc1123_date
+
+import sys
+import os
+import stat
+import urllib
+import urllib2
+import logging
+
+
+# Version string of this utility module.
+HocrTxtUtilsVersion = "0.1"
+
+def getInt(number, default=0):
+    """Return number converted to int, or int(default) if that fails.
+
+    number  -- any value accepted by int() (numbers, numeric strings, ...)
+    default -- value used when number cannot be converted; it is itself
+               passed through int(), so an unconvertible default raises.
+    """
+    try:
+        return int(number)
+    except Exception:
+        # deliberately broad best-effort conversion, but unlike a bare
+        # except this lets KeyboardInterrupt/SystemExit propagate (Python 2.5+)
+        return int(default)
+
+def getAt(array, idx, default=None):
+    """Return array[idx], or default if the lookup fails.
+
+    Works for anything that supports indexing (lists, tuples, dicts, ...).
+    Any ordinary error raised by the lookup (index out of range, missing
+    key, array being None, ...) yields default instead.
+    """
+    try:
+        return array[idx]
+    except Exception:
+        # deliberately broad best-effort lookup, but unlike a bare except
+        # this lets KeyboardInterrupt/SystemExit propagate (Python 2.5+)
+        return default
+
+def unicodify(s):
+    """Decode s into a unicode object.
+
+    Byte strings are decoded as utf-8, falling back to latin-1 when the
+    bytes are not valid utf-8. Any other object is converted with
+    unicode(). Falsy input (None, "", 0, ...) gives u"".
+    """
+    if not s:
+        return u""
+    if isinstance(s, str):
+        try:
+            return s.decode('utf-8')
+        except UnicodeError:
+            # not valid utf-8; latin-1 maps every byte value, so this
+            # fallback cannot fail
+            return s.decode('latin-1')
+    return unicode(s)
+
+def utf8ify(s):
+    """Return s as a byte string in utf-8 representation.
+
+    Unicode objects are encoded as utf-8; everything else is passed through
+    str() (plain strings are assumed to be utf-8 already). Falsy input
+    gives "".
+    """
+    if s:
+        if isinstance(s, unicode):
+            return s.encode('utf-8')
+        return str(s)
+    return ""
+
+def getText(node, recursive=0):
+    """Return the text content of an ElementTree node and its subnodes.
+
+    node      -- an ElementTree element, or None (which gives '').
+    recursive -- if false, only the node's own text plus the text and tail
+                 of its direct children is collected; if true, the text of
+                 all descendants is collected, at any depth.
+    """
+    if node is None:
+        return ''
+
+    # leading text of the node itself, then every child followed by its tail
+    parts = [node.text or '']
+    for e in node:
+        if recursive:
+            # pass the flag on, otherwise the descent stops after one level
+            parts.append(getText(e, recursive=recursive))
+        else:
+            parts.append(e.text or '')
+        if e.tail:
+            parts.append(e.tail)
+
+    return ''.join(parts)
+
+
+
+def getHttpData(url, pn=1,data=None, num_tries=3, timeout=10, noExceptions=False):
+    """Fetch url with an HTTP GET request and return the response body.
+
+    url          -- base URL of the request.
+    pn           -- value of the "pn" query parameter; only added when data
+                    is given.
+    data         -- extra query data: a string is appended as is, a dict,
+                    list or tuple is urlencoded first. With any other value
+                    (e.g. None) url is requested unchanged.
+    num_tries    -- maximum number of attempts.
+    timeout      -- timeout in seconds for each attempt.
+    noExceptions -- if true, return None on failure instead of raising.
+
+    An HTTPError ends the attempts immediately; a URLError is retried until
+    num_tries is used up. Raises IOError if no response was obtained and
+    noExceptions is false.
+    """
+    # we do GET (by appending data to url)
+    if isinstance(data, (str, unicode)):
+        # if data is string then append
+        url = "%s?pn=%s&%s"%(url,pn,data)
+    elif isinstance(data, (dict, list, tuple)):
+        # urlencode
+        url = "%s?pn=%s&%s"%(url,pn,urllib.urlencode(data))
+
+    response = None
+    errmsg = None
+    for cnt in range(num_tries):
+        try:
+            logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
+            if sys.version_info < (2, 6):
+                # urlopen has no timeout parameter before Python 2.6, so set
+                # the process-wide socket default instead -- ugly :-(
+                import socket
+                socket.setdefaulttimeout(float(timeout))
+                response = urllib2.urlopen(url)
+            else:
+                # timeout as parameter
+                response = urllib2.urlopen(url,timeout=float(timeout))
+            break
+        except urllib2.HTTPError, e:
+            # listed before URLError so that it stops the retry loop
+            logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
+            errmsg = str(e)
+            break
+        except urllib2.URLError, e:
+            # no break here: fall through to the next attempt
+            logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
+            errmsg = str(e)
+
+    if response is not None:
+        try:
+            return response.read()
+        finally:
+            # close the connection even if reading the body fails
+            response.close()
+
+    if noExceptions:
+        return None
+
+    raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))
+
+
+def refreshingImageFileIndexHtml(self, REQUEST, RESPONSE):
+    """index_html method for App.ImageFile that updates the file info for each request.
+
+    Re-reads size and modification time of self.path from the file system,
+    stores them in self.size, self.lmt and self.lmh, and then returns the
+    result of the original ImageFile.index_html.
+    """
+    # imported here because the module does not import time at the top
+    import time
+
+    stat_info = os.stat(self.path)
+    self.size = stat_info[stat.ST_SIZE]
+    # fall back to the current time if the file reports an mtime of 0
+    self.lmt = float(stat_info[stat.ST_MTIME]) or time.time()
+    self.lmh = rfc1123_date(self.lmt)
+    # call original method
+    return ImageFile.index_html(self, REQUEST, RESPONSE)
+
+
+def getBrowserType(self):
+    """Check the browser's request to find out the browser type.
+
+    Reads the HTTP_USER_AGENT header from self.REQUEST and returns a dict
+    with these keys:
+      ua         -- the header value as returned by the request
+      isIE       -- user agent contains 'MSIE'
+      isN4       -- not IE and user agent contains 'Mozilla/4.'
+      versIE     -- IE version string; only present if it could be parsed
+      isMac      -- user agent contains 'Macintosh'
+      isWin      -- user agent contains 'Windows'
+      isIEWin    -- isIE and isWin
+      isIEMac    -- isIE and isMac
+      staticHTML -- always False
+    """
+    ua = self.REQUEST.get_header("HTTP_USER_AGENT")
+    # match against an empty string if the request carries no user agent
+    agent = ua or ''
+
+    bt = {}
+    bt['ua'] = ua
+    bt['isIE'] = 'MSIE' in agent
+    bt['isN4'] = (not bt['isIE']) and ('Mozilla/4.' in agent)
+
+    try:
+        # the second "; "-separated field of the parenthesised part holds
+        # "MSIE <version>" in Internet Explorer user agents
+        nav = agent[agent.find('('):]
+        ie = nav.split("; ")[1]
+        if 'MSIE' in ie:
+            bt['versIE'] = ie.split(" ")[1]
+    except IndexError:
+        # user agent has no such field: leave versIE unset
+        pass
+
+    bt['isMac'] = 'Macintosh' in agent
+    bt['isWin'] = 'Windows' in agent
+    bt['isIEWin'] = bt['isIE'] and bt['isWin']
+    bt['isIEMac'] = bt['isIE'] and bt['isMac']
+    bt['staticHTML'] = False
+
+    return bt
+
+