documentViewer: HocrTxtUtils.py comparison

comparison HocrTxtUtils.py @ 617:7aefbddddaf9

alpha of hocr server support

author	dwinter
date	Wed, 23 Jul 2014 17:36:04 +0200
parents
children

comparison

equal deleted inserted replaced

-:3f9b42840901
+:7aefbddddaf9
+"""Utility methods for handling XML, reading HTTP, etc"""
+from App.ImageFile import ImageFile
+from App.Common import rfc1123_date
+import sys
+import os
+import stat
+import urllib
+import urllib2
+import logging
+HocrTxtUtilsVersion = "0.1"
def getInt(number, default=0):
    """Return number converted to an int, or int(default) if that fails.

    number  -- any value accepted by int() (int, float, numeric string, ...)
    default -- fallback value, itself passed through int(); defaults to 0

    Only conversion failures are handled; an error converting default
    is propagated to the caller.
    """
    try:
        return int(number)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or unsupported type; ValueError: non-numeric
        # string or NaN; OverflowError: infinite float.
        return int(default)
def getAt(array, idx, default=None):
    """Return array[idx], or default if the lookup fails.

    array   -- any subscriptable object (list, tuple, dict, ...)
    idx     -- index or key to look up
    default -- value returned when idx is missing or array cannot be indexed
    """
    try:
        return array[idx]
    except (LookupError, TypeError):
        # LookupError covers IndexError and KeyError; TypeError covers
        # non-subscriptable objects (e.g. None) and invalid index types.
        return default
def unicodify(s):
    """Decode a byte string (utf-8 or latin-1) into a unicode object.

    Any false value (None, empty string, 0, ...) yields u"".
    A str is decoded as utf-8 first; if that fails it is decoded as
    latin-1. Every other object is converted with unicode().
    """
    if not s:
        return u""
    if isinstance(s, str):
        try:
            return s.decode('utf-8')
        except UnicodeDecodeError:
            # not valid utf-8: fall back to latin-1
            return s.decode('latin-1')
    return unicode(s)
def utf8ify(s):
    """Return s as a byte string in utf-8 representation.

    Unicode objects are encoded as utf-8; anything else is passed through
    str() (plain strings are assumed to be utf-8 already). Any false
    value yields the empty string.
    """
    if s:
        if isinstance(s, unicode):
            return s.encode('utf-8')
        return str(s)
    return ""
def getText(node, recursive=0):
    """Return the text content of an ElementTree node and its subnodes.

    node      -- ElementTree element, or None (which yields '')
    recursive -- if true, collect text from all descendants at any depth;
                 otherwise only the node's own text plus the text and tail
                 of its direct children

    The tail of node itself is never included; it belongs to the parent.
    """
    if node is None:
        return ''

    text = node.text or ''
    for e in node:
        if recursive:
            # pass the flag on so the whole subtree is collected, not
            # just one extra level
            text += getText(e, recursive)
        else:
            text += e.text or ''
        if e.tail:
            text += e.tail

    return text
def getHttpData(url, pn=1, data=None, num_tries=3, timeout=10, noExceptions=False):
    """Return the body of an HTTP GET request to url (+ pn and data).

    url          -- base URL
    pn           -- value of the "pn" query parameter; only appended
                    together with data
    data         -- query data: a string is appended verbatim, a dict, list
                    or tuple is urlencoded first; with any other value
                    (e.g. None) url is requested unchanged
    num_tries    -- maximum number of attempts
    timeout      -- timeout in seconds for each attempt
    noExceptions -- if true, return None instead of raising on failure

    An HTTPError ends the attempts immediately; a URLError is retried
    until num_tries is exhausted. Raises IOError when no response was
    obtained and noExceptions is false. Other exceptions are not handled
    here and propagate to the caller.
    """
    # we do GET (by appending data to url)
    if isinstance(data, (str, unicode)):
        # if data is string then append
        url = "%s?pn=%s&%s"%(url,pn,data)
    elif isinstance(data, (dict, list, tuple)):
        # urlencode
        url = "%s?pn=%s&%s"%(url,pn,urllib.urlencode(data))

    response = None
    errmsg = None
    for cnt in range(num_tries):
        try:
            logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
            if sys.version_info < (2, 6):
                # set timeout on socket -- ugly :-(
                import socket
                socket.setdefaulttimeout(float(timeout))
                response = urllib2.urlopen(url)
            else:
                # timeout as parameter
                response = urllib2.urlopen(url,timeout=float(timeout))
            break
        except urllib2.HTTPError:
            # sys.exc_info() keeps this parseable on old and new
            # interpreters alike (no "except X, e" / "except X as e")
            e = sys.exc_info()[1]
            logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
            errmsg = str(e)
            # the server answered with an error status: stop trying
            break
        except urllib2.URLError:
            e = sys.exc_info()[1]
            logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
            errmsg = str(e)
            # no break: retry on the next loop iteration

    if response is not None:
        try:
            return response.read()
        finally:
            # close the connection even if reading fails
            response.close()

    if noExceptions:
        return None

    raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))
def refreshingImageFileIndexHtml(self, REQUEST, RESPONSE):
    """index_html method for App.ImageFile that updates the file info for each request.

    Re-reads size and modification time of self.path from the file system,
    stores them in self.size, self.lmt and self.lmh, and then delegates
    to the original ImageFile.index_html.
    """
    # local import: the module does not import time at the top level
    import time

    stat_info = os.stat(self.path)
    self.size = stat_info[stat.ST_SIZE]
    # fall back to the current time when the file reports an mtime of 0
    self.lmt = float(stat_info[stat.ST_MTIME]) or time.time()
    self.lmh = rfc1123_date(self.lmt)
    # call original method
    return ImageFile.index_html(self, REQUEST, RESPONSE)
def getBrowserType(self):
    """Check the request's User-Agent header to find out the browser type.

    Reads HTTP_USER_AGENT from self.REQUEST and returns a dict with:
      ua         -- the raw header value as returned by the request
      isIE       -- True if the agent string contains 'MSIE'
      isN4       -- True if it is not IE and contains 'Mozilla/4.'
      versIE     -- version token following 'MSIE' (only set if it could
                    be extracted from the parenthesised part)
      isMac      -- True if the agent string contains 'Macintosh'
      isWin      -- True if the agent string contains 'Windows'
      isIEWin    -- isIE and isWin
      isIEMac    -- isIE and isMac
      staticHTML -- always False
    """
    bt = {}
    ua = self.REQUEST.get_header("HTTP_USER_AGENT")
    bt['ua'] = ua
    # search an empty string when the header is missing, so that all
    # checks below come out False instead of raising
    ua_text = ua or ''

    bt['isIE'] = 'MSIE' in ua_text
    # the Netscape 4 check only applies when the agent is not IE
    bt['isN4'] = (not bt['isIE']) and ('Mozilla/4.' in ua_text)

    try:
        # second "; "-separated field of the parenthesised part,
        # e.g. "MSIE 6.0"
        nav = ua_text[ua_text.find('('):]
        ie = nav.split("; ")[1]
        if 'MSIE' in ie:
            bt['versIE'] = ie.split(" ")[1]
    except IndexError:
        # agent string does not have the expected layout: no versIE
        pass

    bt['isMac'] = 'Macintosh' in ua_text
    bt['isWin'] = 'Windows' in ua_text
    bt['isIEWin'] = bt['isIE'] and bt['isWin']
    bt['isIEMac'] = bt['isIE'] and bt['isMac']
    bt['staticHTML'] = False
    return bt

Mercurial > hg > documentViewer

comparison HocrTxtUtils.py @ 617:7aefbddddaf9