617
|
1 """Utility methods for handling XML, reading HTTP, etc"""
|
|
2
|
|
3 from App.ImageFile import ImageFile
|
|
4 from App.Common import rfc1123_date
|
|
5
|
|
6 import sys
|
|
7 import os
|
|
8 import stat
|
|
9 import urllib
|
|
10 import urllib2
|
|
11 import logging
|
|
12
|
|
13
|
|
# Version tag of this utility module.
HocrTxtUtilsVersion = "0.1"
|
|
15
|
|
def getInt(number, default=0):
    """Return ``number`` converted to an int, or ``int(default)`` on failure.

    Any ordinary conversion problem (``None``, a non-numeric string, ...)
    selects the default.  ``default`` itself is converted with ``int()``
    without protection, so a default that cannot be converted still raises.
    """
    try:
        return int(number)
    except Exception:
        # Deliberately broad, this is a best-effort helper.  Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return int(default)
|
|
22
|
|
def getAt(array, idx, default=None):
    """Return ``array[idx]``, or ``default`` if the lookup fails.

    Works for anything that supports subscription (lists, tuples, dicts,
    ...).  A missing index/key, or an ``array`` that cannot be subscripted
    at all (e.g. ``None``), yields ``default``.
    """
    try:
        return array[idx]
    except Exception:
        # Deliberately broad, this is a best-effort helper.  Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return default
|
|
29
|
|
def unicodify(s):
    """Decode ``s`` into a unicode object.

    Falsy input (``None``, ``""``, ``0``, ...) yields ``u""``.  Byte strings
    are decoded as UTF-8, falling back to Latin-1 when they are not valid
    UTF-8.  Any other object is passed to ``unicode()``.
    """
    if not s:
        return u""
    if isinstance(s, str):
        try:
            return s.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8.  Latin-1 maps every byte value, so this
            # fallback cannot fail.
            return s.decode('latin-1')
    return unicode(s)
|
|
41
|
|
def utf8ify(s):
    """Return ``s`` as a byte string in UTF-8 representation.

    Unicode objects are encoded as UTF-8.  Everything else is passed through
    ``str()``, i.e. plain strings are assumed to be UTF-8 already.  Falsy
    input yields the empty string.
    """
    if s:
        if isinstance(s, unicode):
            return s.encode('utf-8')
        return str(s)
    return ""
|
|
51
|
|
def getText(node, recursive=0):
    """Return the text content of an ElementTree node and its subnodes.

    node -- an ElementTree element, or None (gives '')
    recursive -- if false, only the node's own text plus the text and tail
        of its direct children are collected; if true, the text of all
        descendants is collected, in document order.
    """
    if node is None:
        return ''

    parts = [node.text or '']
    for child in node:
        if recursive:
            # pass the flag on, otherwise the descent stops one level down
            parts.append(getText(child, recursive))
        else:
            parts.append(child.text or '')
        # the tail is the text between this child and its next sibling
        if child.tail:
            parts.append(child.tail)

    return ''.join(parts)
|
|
75
|
|
76
|
|
77
|
|
78 def getHttpData(url, pn=1,data=None, num_tries=3, timeout=10, noExceptions=False):
|
|
79 """returns result from url+data HTTP request"""
|
|
80 # we do GET (by appending data to url)
|
|
81 if isinstance(data, str) or isinstance(data, unicode):
|
|
82 # if data is string then append
|
|
83 url = "%s?pn=%s&%s"%(url,pn,data)
|
|
84 elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple):
|
|
85 # urlencode
|
|
86 url = "%s?pn=%s&%s"%(url,pn,urllib.urlencode(data))
|
|
87
|
|
88 response = None
|
|
89 errmsg = None
|
|
90 for cnt in range(num_tries):
|
|
91 try:
|
|
92 logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
|
|
93 if sys.version_info < (2, 6):
|
|
94 # set timeout on socket -- ugly :-(
|
|
95 import socket
|
|
96 socket.setdefaulttimeout(float(timeout))
|
|
97 response = urllib2.urlopen(url)
|
|
98 else:
|
|
99 # timeout as parameter
|
|
100 response = urllib2.urlopen(url,timeout=float(timeout))
|
|
101 # check result?
|
|
102 break
|
|
103 except urllib2.HTTPError, e:
|
|
104 logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
|
|
105 errmsg = str(e)
|
|
106 # stop trying
|
|
107 break
|
|
108 except urllib2.URLError, e:
|
|
109 logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
|
|
110 errmsg = str(e)
|
|
111 # stop trying
|
|
112 #break
|
|
113
|
|
114 if response is not None:
|
|
115 data = response.read()
|
|
116 response.close()
|
|
117 return data
|
|
118
|
|
119 if noExceptions:
|
|
120 return None
|
|
121
|
|
122 raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))
|
|
123 #return None
|
|
124
|
|
125
|
|
def refreshingImageFileIndexHtml(self, REQUEST, RESPONSE):
    """index_html method for App.ImageFile that updates the file info for each request.

    Re-reads size and modification time of ``self.path`` from the file
    system, stores them in ``self.size``, ``self.lmt`` and ``self.lmh`` and
    then delegates to the original ``ImageFile.index_html``.
    """
    # imported here because 'time' is not among the module-level imports
    import time

    stat_info = os.stat(self.path)
    self.size = stat_info[stat.ST_SIZE]
    # use the current time if the file reports a modification time of 0
    self.lmt = float(stat_info[stat.ST_MTIME]) or time.time()
    self.lmh = rfc1123_date(self.lmt)
    # call original method
    return ImageFile.index_html(self, REQUEST, RESPONSE)
|
|
134
|
|
135
|
|
def getBrowserType(self):
    """Check the browser's request to find out the browser type.

    Reads the HTTP_USER_AGENT header from ``self.REQUEST`` and returns a
    dict with the keys:

    ua -- the header value as returned by ``REQUEST.get_header``
    isIE -- True if the header contains 'MSIE'
    isN4 -- True if it is not IE and the header contains 'Mozilla/4.'
    versIE -- IE version token; only present if it could be extracted
    isMac, isWin -- header contains 'Macintosh' / 'Windows'
    isIEWin, isIEMac -- isIE combined with isWin / isMac
    staticHTML -- always False
    """
    ua = self.REQUEST.get_header("HTTP_USER_AGENT")
    # A missing header is treated like an empty one for matching; the value
    # reported under 'ua' stays untouched.
    agent = ua or ''

    bt = {'ua': ua, 'isIE': False, 'isN4': False}
    if 'MSIE' in agent:
        bt['isIE'] = True
    else:
        bt['isN4'] = 'Mozilla/4.' in agent

    try:
        # e.g. "(compatible; MSIE 6.0; Windows NT 5.1)": the second
        # "; "-separated field carries the IE version
        nav = agent[agent.find('('):]
        ie = nav.split("; ")[1]
        if 'MSIE' in ie:
            bt['versIE'] = ie.split(" ")[1]
    except IndexError:
        # header does not have the expected layout: leave 'versIE' unset
        pass

    bt['isMac'] = 'Macintosh' in agent
    bt['isWin'] = 'Windows' in agent
    bt['isIEWin'] = bt['isIE'] and bt['isWin']
    bt['isIEMac'] = bt['isIE'] and bt['isMac']
    bt['staticHTML'] = False

    return bt
|
|
162
|
|
163
|