Mercurial > hg > documentViewer
comparison HocrTxtUtils.py @ 617:7aefbddddaf9
alpha of hocr server support
author | dwinter |
---|---|
date | Wed, 23 Jul 2014 17:36:04 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
616:3f9b42840901 | 617:7aefbddddaf9 |
---|---|
1 """Utility methods for handling XML, reading HTTP, etc""" | |
2 | |
3 from App.ImageFile import ImageFile | |
4 from App.Common import rfc1123_date | |
5 | |
6 import sys | |
7 import os | |
8 import stat | |
9 import urllib | |
10 import urllib2 | |
11 import logging | |
12 | |
13 | |
14 HocrTxtUtilsVersion = "0.1" | |
15 | |
def getInt(number, default=0):
    """returns always an int: int(number), or int(default) in case of problems

    number  -- value to convert.
    default -- value converted and returned when number cannot be converted
               (0 unless given). An error converting default itself is not
               caught.
    """
    try:
        return int(number)
    except Exception:
        # not a bare except: KeyboardInterrupt and SystemExit must propagate
        return int(default)
22 | |
def getAt(array, idx, default=None):
    """returns element idx from array or default (in case of problems)

    array   -- any object supporting array[idx]; objects that do not support
               it (e.g. None) yield default as well.
    idx     -- index or key to look up.
    default -- value returned when the lookup raises an exception.
    """
    try:
        return array[idx]
    except Exception:
        # not a bare except: KeyboardInterrupt and SystemExit must propagate
        return default
29 | |
def unicodify(s):
    """decode str (utf-8 or latin-1 representation) into unicode object

    Any false value (None, "", 0, ...) gives u"". A str is decoded as utf-8,
    falling back to latin-1 if it is not valid utf-8. Every other object is
    converted with unicode().
    """
    if not s:
        return u""
    if not isinstance(s, str):
        return unicode(s)
    try:
        return s.decode('utf-8')
    except UnicodeDecodeError:
        # only a failed utf-8 decode triggers the fallback; latin-1 maps every
        # byte value to a code point, so this second decode cannot fail
        return s.decode('latin-1')
41 | |
def utf8ify(s):
    """return s as a byte string in utf-8 representation.

    Any false value gives "". unicode objects are encoded as utf-8; everything
    else is passed through str(), i.e. string objects are assumed to be utf-8
    already.
    """
    if s:
        if not isinstance(s, unicode):
            return str(s)
        return s.encode('utf-8')
    return ""
51 | |
def getText(node, recursive=0):
    """returns all text content of a node and its subnodes

    node      -- ElementTree element, or None (gives '').
    recursive -- if false, only the text of node and of its direct children
                 (plus the children's tails) is collected; if true, the
                 whole subtree is collected in document order.
    """
    if node is None:
        return ''

    # ElementTree keeps the text before the first child in .text and the text
    # following each child's end tag in that child's .tail
    parts = [node.text or '']
    for child in node:
        if recursive:
            # hand the flag down; without it the descent stopped one level
            # below the child and deeper text was silently dropped
            parts.append(getText(child, recursive))
        else:
            parts.append(child.text or '')
        if child.tail:
            parts.append(child.tail)

    return ''.join(parts)
75 | |
76 | |
77 | |
def getHttpData(url, pn=1, data=None, num_tries=3, timeout=10, noExceptions=False):
    """returns result from url+data HTTP request

    url          -- base URL to fetch.
    pn           -- value sent as the "pn" query parameter; only used when
                    data is given.
    data         -- extra query data. A string is appended as is, a dict, list
                    or tuple is urlencoded first; in both cases the request
                    URL becomes "<url>?pn=<pn>&<data>". With any other value
                    (including None) url is requested unchanged.
    num_tries    -- maximum number of attempts.
    timeout      -- timeout in seconds for each attempt.
    noExceptions -- if true, return None instead of raising when no response
                    could be obtained.

    An HTTPError ends the attempts immediately; a URLError is logged and the
    next attempt is made.

    Returns the response body. Raises IOError if no response was obtained and
    noExceptions is false.
    """
    # we do GET (by appending data to url)
    if isinstance(data, (str, unicode)):
        # if data is string then append
        url = "%s?pn=%s&%s" % (url, pn, data)
    elif isinstance(data, (dict, list, tuple)):
        # urlencode
        url = "%s?pn=%s&%s" % (url, pn, urllib.urlencode(data))

    response = None
    errmsg = None
    for cnt in range(num_tries):
        try:
            logging.debug("getHttpData(#%s %ss) url=%s", cnt + 1, timeout, url)
            if sys.version_info < (2, 6):
                # set timeout on socket -- ugly :-(
                import socket
                socket.setdefaulttimeout(float(timeout))
                response = urllib2.urlopen(url)
            else:
                # timeout as parameter
                response = urllib2.urlopen(url, timeout=float(timeout))
            break
        except urllib2.HTTPError:
            # sys.exc_info() instead of "except X, e" / "except X as e": this
            # form parses on every Python version, including the pre-2.6
            # interpreters handled above
            e = sys.exc_info()[1]
            logging.error("getHttpData: HTTP error(%s): %s", e.code, e)
            errmsg = str(e)
            # the server answered with an error status: stop trying
            break
        except urllib2.URLError:
            e = sys.exc_info()[1]
            logging.error("getHttpData: URLLIB error(%s): %s", e.reason, e)
            errmsg = str(e)
            # no break here: fall through to the next attempt

    if response is not None:
        try:
            return response.read()
        finally:
            # close the connection even if read() fails
            response.close()

    if noExceptions:
        return None

    raise IOError("ERROR fetching HTTP data from %s: %s" % (url, errmsg))
124 | |
125 | |
def refreshingImageFileIndexHtml(self, REQUEST, RESPONSE):
    """index_html method for App.ImageFile that updates the file info for each request.

    Stats self.path and stores the current size in self.size, the
    modification time in self.lmt and its rfc1123_date() form in self.lmh,
    then returns the result of the original ImageFile.index_html. If the
    modification time is 0 the current time is used instead.
    """
    # the module does not import time at top level; without this import the
    # fallback below raised NameError whenever the mtime was 0
    import time

    stat_info = os.stat(self.path)
    self.size = stat_info[stat.ST_SIZE]
    self.lmt = float(stat_info[stat.ST_MTIME]) or time.time()
    self.lmh = rfc1123_date(self.lmt)
    # call original method
    return ImageFile.index_html(self, REQUEST, RESPONSE)
134 | |
135 | |
def getBrowserType(self):
    """check the browsers request to find out the browser type

    Reads the HTTP_USER_AGENT header from self.REQUEST and returns a dict:
      ua         -- the header value as returned by the request
      isIE       -- True if the user agent contains 'MSIE'
      isN4       -- True if it is not IE and contains 'Mozilla/4.'
      versIE     -- IE version string; only present if it could be parsed
      isMac      -- True if the user agent contains 'Macintosh'
      isWin      -- True if the user agent contains 'Windows'
      isIEWin    -- isIE and isWin
      isIEMac    -- isIE and isMac
      staticHTML -- always False
    """
    ua = self.REQUEST.get_header("HTTP_USER_AGENT")
    # str methods replace the string module, which this file never imports.
    # A missing header (None) is treated as an empty user agent.
    agent = ua or ''

    bt = {}
    bt['ua'] = ua
    bt['isIE'] = False
    bt['isN4'] = False
    if 'MSIE' in agent:
        bt['isIE'] = True
    else:
        bt['isN4'] = 'Mozilla/4.' in agent

    try:
        # the second "; "-separated field inside the parentheses is expected
        # to look like "MSIE <version>"
        nav = agent[agent.find('('):]
        ie = nav.split("; ")[1]
        if 'MSIE' in ie:
            bt['versIE'] = ie.split(" ")[1]
    except IndexError:
        # user agent without that field: leave versIE unset
        pass

    bt['isMac'] = 'Macintosh' in agent
    bt['isWin'] = 'Windows' in agent
    bt['isIEWin'] = bt['isIE'] and bt['isWin']
    bt['isIEMac'] = bt['isIE'] and bt['isMac']
    bt['staticHTML'] = False

    return bt
162 | |
163 |