1: """Beautiful Soup
2: Elixir and Tonic
3: "The Screen-Scraper's Friend"
4: http://www.crummy.com/software/BeautifulSoup/
5:
6: Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7: tree representation. It provides methods and Pythonic idioms that make
8: it easy to navigate, search, and modify the tree.
9:
10: A well-formed XML/HTML document yields a well-formed data
11: structure. An ill-formed XML/HTML document yields a correspondingly
12: ill-formed data structure. If your document is only locally
13: well-formed, you can use this library to find and process the
14: well-formed part of it.
15:
16: Beautiful Soup works with Python 2.2 and up. It has no external
17: dependencies, but you'll have more success at converting data to UTF-8
18: if you also install these three packages:
19:
20: * chardet, for auto-detecting character encodings
21: http://chardet.feedparser.org/
22: * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23: by stock Python.
24: http://cjkpython.i18n.org/
25:
26: Beautiful Soup defines classes for two main parsing strategies:
27:
28: * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29: language that kind of looks like XML.
30:
31: * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32: or invalid. This class has web browser-like heuristics for
33: obtaining a sensible parse tree in the face of common HTML errors.
34:
35: Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36: the encoding of an HTML or XML document, and converting it to
37: Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38:
39: For more than you ever wanted to know about Beautiful Soup, see the
40: documentation:
41: http://www.crummy.com/software/BeautifulSoup/documentation.html
42:
43: Here, have some legalese:
44:
45: Copyright (c) 2004-2010, Leonard Richardson
46:
47: All rights reserved.
48:
49: Redistribution and use in source and binary forms, with or without
50: modification, are permitted provided that the following conditions are
51: met:
52:
53: * Redistributions of source code must retain the above copyright
54: notice, this list of conditions and the following disclaimer.
55:
56: * Redistributions in binary form must reproduce the above
57: copyright notice, this list of conditions and the following
58: disclaimer in the documentation and/or other materials provided
59: with the distribution.
60:
  * Neither the name of the Beautiful Soup Consortium and All
62: Night Kosher Bakery nor the names of its contributors may be
63: used to endorse or promote products derived from this software
64: without specific prior written permission.
65:
66: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71: EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72: PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73: PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75: NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77:
78: """
79: from __future__ import generators
80:
81: __author__ = "Leonard Richardson (leonardr@segfault.org)"
82: __version__ = "3.2.0"
83: __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84: __license__ = "New-style BSD"
85:
86: from sgmllib import SGMLParser, SGMLParseError
87: import codecs
88: import markupbase
89: import types
90: import re
91: import sgmllib
92: try:
93: from htmlentitydefs import name2codepoint
94: except ImportError:
95: name2codepoint = {}
96: try:
97: set
98: except NameError:
99: from sets import Set as set
100:
#These hacks make Beautiful Soup able to parse XML with namespaces
#(the replacement patterns also accept ':' and '.' inside tag and
#declaration names).
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Byte encoding used by default whenever a tree is rendered to a string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
106:
107: def _match_css_class(str):
108: """Build a RE to match the given CSS class."""
109: return re.compile(r"(^|.*\s)%s($|\s)" % str)
110:
111: # First, the classes that represent markup elements.
112:
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Elements are doubly linked in two ways: parse order (previous/next)
    and sibling order (previousSibling/nextSibling), plus a parent link.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # We become the parent's newest child, so the current last
            # child becomes our previous sibling.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element,
        which may be an existing member of the same tree."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
                and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                #
                # BUG FIX: this condition used to read
                # "if index and index < myIndex", which skipped the
                # adjustment whenever the replacement was the parent's
                # FIRST child (index 0), leaving the replacement one
                # position too far to the right after the insert.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, preserving
        their order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert in reverse so repeated inserts at the same index end
        # up in the original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, rewiring all parse-order and sibling links."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            # First child: its parse-order predecessor is this element.
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the next element in parse order is
            # the nearest ancestor's next sibling, if any.
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll*-style method with limit 1 and returns the
        single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            # The navigation generators below yield a trailing None
            # when they run off the end of the tree; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        """Yields each element after this one in parse order."""
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        """Yields each later sibling of this element in order."""
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        """Yields each element before this one in parse order."""
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        """Yields each earlier sibling of this element in order."""
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        """Yields each ancestor of this element, innermost first."""
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder with the given
        encoding name (defaulting to utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
423:
class NavigableString(unicode, PageElement):
    # A text node: a Unicode string that also participates in the parse
    # tree via the PageElement navigation mix-in.

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct through __new__ using the encoded
        # (byte-string) form of this string.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trips through the default encoding, yielding a plain
        # unicode object rather than a NavigableString subclass.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # With an encoding, return an encoded byte string; with
        # encoding=None, return the (unicode) string itself.
        if encoding:
            return self.encode(encoding)
        else:
            return self
458:
class CData(NavigableString):
    """A CDATA section: renders its text wrapped in CDATA markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the underlying string, then wrap it in the markers.
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + inner + "]]>"
463:
class ProcessingInstruction(NavigableString):
    """A processing instruction: renders as <?...?>, substituting the
    real encoding for any %SOUP-ENCODING% placeholder."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Fill in the actual encoding if the PI carries the placeholder.
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        return "<?%s?>" % self.toEncoding(text, encoding)
470:
class Comment(NavigableString):
    """An HTML/XML comment: renders its text wrapped in <!-- -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--" + inner + "-->"
474:
class Declaration(NavigableString):
    """A markup declaration: renders its text wrapped in <! >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!" + inner + ">"
478:
479: class Tag(PageElement):
480:
481: """Represents a found HTML tag with its attributes and contents."""
482:
483: def _invert(h):
484: "Cheap function to invert a hash."
485: i = {}
486: for k,v in h.items():
487: i[v] = k
488: return i
489:
490: XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
491: "quot" : '"',
492: "amp" : "&",
493: "lt" : "<",
494: "gt" : ">" }
495:
496: XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497:
498: def _convertEntities(self, match):
499: """Used in a call to re.sub to replace HTML, XML, and numeric
500: entities with the appropriate Unicode characters. If HTML
501: entities are being converted, any unrecognized entities are
502: escaped."""
503: x = match.group(1)
504: if self.convertHTMLEntities and x in name2codepoint:
505: return unichr(name2codepoint[x])
506: elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
507: if self.convertXMLEntities:
508: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
509: else:
510: return u'&%s;' % x
511: elif len(x) > 0 and x[0] == '#':
512: # Handle numeric entities
513: if len(x) > 1 and x[1] == 'x':
514: return unichr(int(x[2:], 16))
515: else:
516: return unichr(int(x[1:]))
517:
518: elif self.escapeUnrecognizedEntities:
519: return u'&%s;' % x
520: else:
521: return u'&%s;' % x
522:
523: def __init__(self, parser, name, attrs=None, parent=None,
524: previous=None):
525: "Basic constructor."
526:
527: # We don't actually store the parser object: that lets extracted
528: # chunks be garbage-collected
529: self.parserClass = parser.__class__
530: self.isSelfClosing = parser.isSelfClosingTag(name)
531: self.name = name
532: if attrs is None:
533: attrs = []
534: elif isinstance(attrs, dict):
535: attrs = attrs.items()
536: self.attrs = attrs
537: self.contents = []
538: self.setup(parent, previous)
539: self.hidden = False
540: self.containsSubstitutions = False
541: self.convertHTMLEntities = parser.convertHTMLEntities
542: self.convertXMLEntities = parser.convertXMLEntities
543: self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
544:
545: # Convert any HTML, XML, or numeric entities in the attribute values.
546: convert = lambda(k, val): (k,
547: re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
548: self._convertEntities,
549: val))
550: self.attrs = map(convert, self.attrs)
551:
552: def getString(self):
553: if (len(self.contents) == 1
554: and isinstance(self.contents[0], NavigableString)):
555: return self.contents[0]
556:
557: def setString(self, string):
558: """Replace the contents of the tag with a string"""
559: self.clear()
560: self.append(string)
561:
562: string = property(getString, setString)
563:
564: def getText(self, separator=u""):
565: if not len(self.contents):
566: return u""
567: stopNode = self._lastRecursiveChild().next
568: strings = []
569: current = self.contents[0]
570: while current is not stopNode:
571: if isinstance(current, NavigableString):
572: strings.append(current.strip())
573: current = current.next
574: return separator.join(strings)
575:
576: text = property(getText)
577:
578: def get(self, key, default=None):
579: """Returns the value of the 'key' attribute for the tag, or
580: the value given for 'default' if it doesn't have that
581: attribute."""
582: return self._getAttrMap().get(key, default)
583:
584: def clear(self):
585: """Extract all children."""
586: for child in self.contents[:]:
587: child.extract()
588:
589: def index(self, element):
590: for i, child in enumerate(self.contents):
591: if child is element:
592: return i
593: raise ValueError("Tag.index: element not in tag")
594:
595: def has_key(self, key):
596: return self._getAttrMap().has_key(key)
597:
598: def __getitem__(self, key):
599: """tag[key] returns the value of the 'key' attribute for the tag,
600: and throws an exception if it's not there."""
601: return self._getAttrMap()[key]
602:
603: def __iter__(self):
604: "Iterating over a tag iterates over its contents."
605: return iter(self.contents)
606:
607: def __len__(self):
608: "The length of a tag is the length of its list of contents."
609: return len(self.contents)
610:
611: def __contains__(self, x):
612: return x in self.contents
613:
614: def __nonzero__(self):
615: "A tag is non-None even if it has no contents."
616: return True
617:
618: def __setitem__(self, key, value):
619: """Setting tag[key] sets the value of the 'key' attribute for the
620: tag."""
621: self._getAttrMap()
622: self.attrMap[key] = value
623: found = False
624: for i in range(0, len(self.attrs)):
625: if self.attrs[i][0] == key:
626: self.attrs[i] = (key, value)
627: found = True
628: if not found:
629: self.attrs.append((key, value))
630: self._getAttrMap()[key] = value
631:
632: def __delitem__(self, key):
633: "Deleting tag[key] deletes all 'key' attributes for the tag."
634: for item in self.attrs:
635: if item[0] == key:
636: self.attrs.remove(item)
637: #We don't break because bad HTML can define the same
638: #attribute multiple times.
639: self._getAttrMap()
640: if self.attrMap.has_key(key):
641: del self.attrMap[key]
642:
643: def __call__(self, *args, **kwargs):
644: """Calling a tag like a function is the same as calling its
645: findAll() method. Eg. tag('a') returns a list of all the A tags
646: found within this tag."""
647: return apply(self.findAll, args, kwargs)
648:
649: def __getattr__(self, tag):
650: #print "Getattr %s.%s" % (self.__class__, tag)
651: if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652: return self.find(tag[:-3])
653: elif tag.find('__') != 0:
654: return self.find(tag)
655: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
656:
657: def __eq__(self, other):
658: """Returns true iff this tag has the same name, the same attributes,
659: and the same contents (recursively) as the given tag.
660:
661: NOTE: right now this will return false if two tags have the
662: same attributes in a different order. Should this be fixed?"""
663: if other is self:
664: return True
665: if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
666: return False
667: for i in range(0, len(self.contents)):
668: if self.contents[i] != other.contents[i]:
669: return False
670: return True
671:
672: def __ne__(self, other):
673: """Returns true iff this tag is not identical to the other tag,
674: as defined in __eq__."""
675: return not self == other
676:
677: def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678: """Renders this tag as a string."""
679: return self.__str__(encoding)
680:
681: def __unicode__(self):
682: return self.__str__(None)
683:
684: BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685: + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686: + ")")
687:
688: def _sub_entity(self, x):
689: """Used with a regular expression to substitute the
690: appropriate XML entity for an XML special character."""
691: return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692:
693: def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694: prettyPrint=False, indentLevel=0):
695: """Returns a string or Unicode representation of this tag and
696: its contents. To get Unicode, pass None for encoding.
697:
698: NOTE: since Python's HTML parser consumes whitespace, this
699: method is not certain to reproduce the whitespace present in
700: the original string."""
701:
702: encodedName = self.toEncoding(self.name, encoding)
703:
704: attrs = []
705: if self.attrs:
706: for key, val in self.attrs:
707: fmt = '%s="%s"'
708: if isinstance(val, basestring):
709: if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710: val = self.substituteEncoding(val, encoding)
711:
712: # The attribute value either:
713: #
714: # * Contains no embedded double quotes or single quotes.
715: # No problem: we enclose it in double quotes.
716: # * Contains embedded single quotes. No problem:
717: # double quotes work here too.
718: # * Contains embedded double quotes. No problem:
719: # we enclose it in single quotes.
720: # * Embeds both single _and_ double quotes. This
721: # can't happen naturally, but it can happen if
722: # you modify an attribute value after parsing
723: # the document. Now we have a bit of a
724: # problem. We solve it by enclosing the
725: # attribute in single quotes, and escaping any
726: # embedded single quotes to XML entities.
727: if '"' in val:
728: fmt = "%s='%s'"
729: if "'" in val:
730: # TODO: replace with apos when
731: # appropriate.
732: val = val.replace("'", "&squot;")
733:
734: # Now we're okay w/r/t quotes. But the attribute
735: # value might also contain angle brackets, or
736: # ampersands that aren't part of entities. We need
737: # to escape those to XML entities too.
738: val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
739:
740: attrs.append(fmt % (self.toEncoding(key, encoding),
741: self.toEncoding(val, encoding)))
742: close = ''
743: closeTag = ''
744: if self.isSelfClosing:
745: close = ' /'
746: else:
747: closeTag = '</%s>' % encodedName
748:
749: indentTag, indentContents = 0, 0
750: if prettyPrint:
751: indentTag = indentLevel
752: space = (' ' * (indentTag-1))
753: indentContents = indentTag + 1
754: contents = self.renderContents(encoding, prettyPrint, indentContents)
755: if self.hidden:
756: s = contents
757: else:
758: s = []
759: attributeString = ''
760: if attrs:
761: attributeString = ' ' + ' '.join(attrs)
762: if prettyPrint:
763: s.append(space)
764: s.append('<%s%s%s>' % (encodedName, attributeString, close))
765: if prettyPrint:
766: s.append("\n")
767: s.append(contents)
768: if prettyPrint and contents and contents[-1] != "\n":
769: s.append("\n")
770: if prettyPrint and closeTag:
771: s.append(space)
772: s.append(closeTag)
773: if prettyPrint and closeTag and self.nextSibling:
774: s.append("\n")
775: s = ''.join(s)
776: return s
777:
778: def decompose(self):
779: """Recursively destroys the contents of this tree."""
780: self.extract()
781: if len(self.contents) == 0:
782: return
783: current = self.contents[0]
784: while current is not None:
785: next = current.next
786: if isinstance(current, Tag):
787: del current.contents[:]
788: current.parent = None
789: current.previous = None
790: current.previousSibling = None
791: current.next = None
792: current.nextSibling = None
793: current = next
794:
795: def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796: return self.__str__(encoding, True)
797:
798: def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799: prettyPrint=False, indentLevel=0):
800: """Renders the contents of this tag as a string in the given
801: encoding. If encoding is None, returns a Unicode string.."""
802: s=[]
803: for c in self:
804: text = None
805: if isinstance(c, NavigableString):
806: text = c.__str__(encoding)
807: elif isinstance(c, Tag):
808: s.append(c.__str__(encoding, prettyPrint, indentLevel))
809: if text and prettyPrint:
810: text = text.strip()
811: if text:
812: if prettyPrint:
813: s.append(" " * (indentLevel-1))
814: s.append(text)
815: if prettyPrint:
816: s.append("\n")
817: return ''.join(s)
818:
819: #Soup methods
820:
821: def find(self, name=None, attrs={}, recursive=True, text=None,
822: **kwargs):
823: """Return only the first child of this Tag matching the given
824: criteria."""
825: r = None
826: l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
827: if l:
828: r = l[0]
829: return r
830: findChild = find
831:
832: def findAll(self, name=None, attrs={}, recursive=True, text=None,
833: limit=None, **kwargs):
834: """Extracts a list of Tag objects that match the given
835: criteria. You can specify the name of the Tag and any
836: attributes you want the Tag to have.
837:
838: The value of a key-value pair in the 'attrs' map can be a
839: string, a list of strings, a regular expression object, or a
840: callable that takes a string and returns whether or not the
841: string matches for some custom definition of 'matches'. The
842: same is true of the tag name."""
843: generator = self.recursiveChildGenerator
844: if not recursive:
845: generator = self.childGenerator
846: return self._findAll(name, attrs, text, limit, generator, **kwargs)
847: findChildren = findAll
848:
849: # Pre-3.x compatibility methods
850: first = find
851: fetch = findAll
852:
853: def fetchText(self, text=None, recursive=True, limit=None):
854: return self.findAll(text=text, recursive=recursive, limit=limit)
855:
856: def firstText(self, text=None, recursive=True):
857: return self.find(text=text, recursive=recursive)
858:
859: #Private methods
860:
861: def _getAttrMap(self):
862: """Initializes a map representation of this tag's attributes,
863: if not already initialized."""
864: if not getattr(self, 'attrMap'):
865: self.attrMap = {}
866: for (key, value) in self.attrs:
867: self.attrMap[key] = value
868: return self.attrMap
869:
870: #Generator methods
871: def childGenerator(self):
872: # Just use the iterator from the contents
873: return iter(self.contents)
874:
875: def recursiveChildGenerator(self):
876: if not len(self.contents):
877: raise StopIteration
878: stopNode = self._lastRecursiveChild().next
879: current = self.contents[0]
880: while current is not stopNode:
881: yield current
882: current = current.next
883:
884:
885: # Next, a couple classes to represent queries and their results.
886: class SoupStrainer:
887: """Encapsulates a number of ways of matching a markup element (tag or
888: text)."""
889:
890: def __init__(self, name=None, attrs={}, text=None, **kwargs):
891: self.name = name
892: if isinstance(attrs, basestring):
893: kwargs['class'] = _match_css_class(attrs)
894: attrs = None
895: if kwargs:
896: if attrs:
897: attrs = attrs.copy()
898: attrs.update(kwargs)
899: else:
900: attrs = kwargs
901: self.attrs = attrs
902: self.text = text
903:
904: def __str__(self):
905: if self.text:
906: return self.text
907: else:
908: return "%s|%s" % (self.name, self.attrs)
909:
910: def searchTag(self, markupName=None, markupAttrs={}):
911: found = None
912: markup = None
913: if isinstance(markupName, Tag):
914: markup = markupName
915: markupAttrs = markup
916: callFunctionWithTagData = callable(self.name) \
917: and not isinstance(markupName, Tag)
918:
919: if (not self.name) \
920: or callFunctionWithTagData \
921: or (markup and self._matches(markup, self.name)) \
922: or (not markup and self._matches(markupName, self.name)):
923: if callFunctionWithTagData:
924: match = self.name(markupName, markupAttrs)
925: else:
926: match = True
927: markupAttrMap = None
928: for attr, matchAgainst in self.attrs.items():
929: if not markupAttrMap:
930: if hasattr(markupAttrs, 'get'):
931: markupAttrMap = markupAttrs
932: else:
933: markupAttrMap = {}
934: for k,v in markupAttrs:
935: markupAttrMap[k] = v
936: attrValue = markupAttrMap.get(attr)
937: if not self._matches(attrValue, matchAgainst):
938: match = False
939: break
940: if match:
941: if markup:
942: found = markup
943: else:
944: found = markupName
945: return found
946:
947: def search(self, markup):
948: #print 'looking for %s in %s' % (self, markup)
949: found = None
950: # If given a list of items, scan it for a text element that
951: # matches.
952: if hasattr(markup, "__iter__") \
953: and not isinstance(markup, Tag):
954: for element in markup:
955: if isinstance(element, NavigableString) \
956: and self.search(element):
957: found = element
958: break
959: # If it's a Tag, make sure its name or attributes match.
960: # Don't bother with Tags if we're searching for text.
961: elif isinstance(markup, Tag):
962: if not self.text:
963: found = self.searchTag(markup)
964: # If it's text, make sure the text matches.
965: elif isinstance(markup, NavigableString) or \
966: isinstance(markup, basestring):
967: if self._matches(markup, self.text):
968: found = markup
969: else:
970: raise Exception, "I don't know how to match against a %s" \
971: % markup.__class__
972: return found
973:
974: def _matches(self, markup, matchAgainst):
975: #print "Matching %s against %s" % (markup, matchAgainst)
976: result = False
977: if matchAgainst is True:
978: result = markup is not None
979: elif callable(matchAgainst):
980: result = matchAgainst(markup)
981: else:
982: #Custom match methods take the tag as an argument, but all
983: #other ways of matching match the tag name as a string.
984: if isinstance(markup, Tag):
985: markup = markup.name
986: if markup and not isinstance(markup, basestring):
987: markup = unicode(markup)
988: #Now we know that chunk is either a string, or None.
989: if hasattr(matchAgainst, 'match'):
990: # It's a regexp object.
991: result = markup and matchAgainst.search(markup)
992: elif hasattr(matchAgainst, '__iter__'): # list-like
993: result = markup in matchAgainst
994: elif hasattr(matchAgainst, 'items'):
995: result = markup.has_key(matchAgainst)
996: elif matchAgainst and isinstance(markup, basestring):
997: if isinstance(markup, unicode):
998: matchAgainst = unicode(matchAgainst)
999: else:
1000: matchAgainst = str(matchAgainst)
1001:
1002: if not result:
1003: result = matchAgainst == markup
1004: return result
1005:
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway list instead of self. Initialize
        # this instance (empty) instead.
        list.__init__(self)
        # The SoupStrainer that produced these results.
        self.source = source
1012:
1013: # Now, some helper functions.
1014:
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Duck-typing (hasattr) rather than isinstance is deliberate: on
    Python 2 a plain string has no __iter__, so it falls through to
    the scalar branch and is mapped whole."""
    mapping = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # Map: merge its entries, later portions winning.
            mapping.update(portion)
        elif hasattr(portion, '__iter__'): # is a list
            # Sequence: each element maps to the default value.
            for key in portion:
                mapping[key] = default
        else:
            # Scalar: map it to the default value.
            mapping[portion] = default
    return mapping
1033:
1034: # Now, the parser classes.
1035:
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these to encode tag behavior; the base XML
    # parser assumes nothing.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the markup before parsing (see __init__):
    # add a space before "/>" and strip whitespace after "<!".
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the root of the parse tree (the soup object itself).
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        # NOTE(review): with selfClosingTags=None this produces
        # {None: None}, not {} -- harmless, since isSelfClosingTag only
        # looks up real tag names.
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised (e.g. by BeautifulSoup.start_meta) after the
            # document has been re-fed with a better encoding; the
            # parse tree is already complete at this point.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            # Not a numeric reference; returns None (reference is
            # left unconverted).
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, apply the markup-massage
        fixups, and run it through the SGML parser, building the tree.

        inDocumentEncoding -- an encoding declared inside the document
          (e.g. by a meta tag), tried before sniffed encodings.
        isHTML -- passed through to UnicodeDammit's encoding sniffer."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not hasattr(self.markupMassage, "__iter__"):
                    # markupMassage=True means "use the default fixups";
                    # a caller-supplied list is used as-is.
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset both superclasses and reinitialize the tag stack so the
        soup object itself is the sole (root) open tag."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # The root tag is hidden so it doesn't appear in output.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Close the most recently opened tag and return the new
        current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Open a tag: attach it to the current tag's contents and make
        it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the accumulated character data into the tree as a
        single node of the given NavigableString subclass, collapsing
        whitespace-only runs unless inside a PRESERVE_WHITESPACE tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # translate() with a codepoint map requires unicode, which
            # currentData is (u''.join above).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, drop top-level text that the
            # strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Search downward from the top of the stack (index 0 is the
        # root, which is excluded).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk down the open-tag stack looking for either a previous
        # tag of this name or a nesting-reset boundary.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment has no effect -- p is
            # rebound from the stack at the top of the next iteration.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback for an opening tag: build a Tag node, wire
        it into the tree, and handle self-closing and quoted (literal)
        tags. Returns the new Tag, or None if it was filtered out."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Inside a QUOTE_TAGS section, re-serialize the tag as text.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing selectively, skip top-level tags the strainer
        # doesn't want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback for a closing tag: flush pending text and
        pop the stack to the matching open tag, tracking QUOTE_TAGS
        (literal) sections."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Accumulate character data until endData() flushes it."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Leave the reference in its literal &#NNN; form.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: take everything to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # A declaration sgmllib can't parse becomes plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1466:
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same signature as BeautifulStoneSoup, but smart quotes become
        HTML entities by default and the encoding sniffer is told to
        look for HTML hints (e.g. meta tags)."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags that have no closing tag; they are popped as soon as opened.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    # Whitespace-only text inside these tags is kept verbatim instead
    # of being collapsed to a single space (see endData).
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Content of these tags is treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; the re-feed above has already
                        # rebuilt the tree with the right encoding.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1619:
class StopParsing(Exception):
    """Raised to abort the current parsing pass; caught in
    BeautifulStoneSoup.__init__. Used by BeautifulSoup.start_meta after
    re-feeding the document with a newly discovered encoding."""
    pass
1622:
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' appear twice in this tuple;
    # harmless, since buildTagMap deduplicates via dict keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend BeautifulSoup's nesting rules with the tags above, each
    # mapped to [] (nestable, no reset triggers).
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1658:
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's first parameter is the *default
    # value*, so this call passes no tags at all and yields an empty
    # map -- 'noscript' does NOT reset nesting here. If that were the
    # intent it would read buildTagMap(None, 'noscript'). Confirm
    # before changing: existing callers may rely on the empty map.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1671:
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a single-string child tag into the
        parent as an attribute (unless the parent already has an
        attribute of that name), then defer to the superclass."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Ensure parent.attrMap exists before checking it below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1702:
1703: #Enterprise class names! It has come to our attention that some people
1704: #think the names of the Beautiful Soup parser classes are too silly
1705: #and "unprofessional" for use in enterprise screen-scraping. We feel
1706: #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707: #All-Night Kosher Bakery recommends renaming this file to
1708: #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709: #"RobustParserBeanInterface.class") and using the following
1710: #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
1721:
1722: ######################################################
1723: #
1724: # Bonus library: Unicode, Dammit
1725: #
1726: # This class forces XML data into a standard format (usually to UTF-8
1727: # or Unicode). It is heavily based on code from Mark Pilgrim's
1728: # Universal Feed Parser. It does not rewrite the XML or HTML to
1729: # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730: # (XML) and BeautifulSoup.start_meta (HTML).
1731:
1732: # Autodetects character encodings.
1733: # Download from http://chardet.feedparser.org/
1734: try:
1735: import chardet
1736: # import chardet.constants
1737: # chardet.constants._debug = 1
1738: except ImportError:
1739: chardet = None
1740:
1741: # cjkcodecs and iconv_codec make Python know about more character encodings.
1742: # Both are available from http://cjkpython.i18n.org/
1743: # They're built in if you use Python 2.4.
1744: try:
1745: import cjkcodecs.aliases
1746: except ImportError:
1747: pass
1748: try:
1749: import iconv_codec
1750: except ImportError:
1751: pass
1752:
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=None,
                 smartQuotesTo='xml', isHTML=False):
        """Detect the encoding of markup and decode it to Unicode.

        markup -- the document: a byte string, or an already-Unicode
            string (which is passed through untouched).
        overrideEncodings -- encodings to try before anything detected
            from the document itself. (The default used to be a shared
            mutable list ``[]``; ``None`` avoids the mutable-default
            pitfall and is equivalent for every caller.)
        smartQuotesTo -- 'xml' or 'html': what to convert the
            windows-1252 "smart punctuation" characters into. A false
            value leaves them alone.
        isHTML -- if true, also honor a <meta> charset declaration.

        On success self.unicode holds the decoded document and
        self.originalEncoding names the codec that worked; if every
        candidate fails, both are None.
        """
        if overrideEncodings is None:
            overrideEncodings = []
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Already-Unicode (or empty) input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        # 1. Caller-supplied encodings take priority.
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        # 2. Then whatever the document declares, then what its byte
        #    pattern / BOM implies.
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # 3. If no luck and we have an auto-detection library, try that.
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # 4. As a last resort, try utf-8 and windows-1252.
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Return the replacement for one MS smart-quote character:
        a numeric XML entity, a named HTML entity, or the literal
        fallback string stored in MS_CHARS, depending on
        self.smartQuotesTo."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the proposed encoding.

        Returns the Unicode string on success, None on failure. Each
        codec is attempted at most once per document (triedEncodings)."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them. \x80-\x9f is the windows-1252
        # "smart punctuation" range.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            # NOTE: was the py2-only form "lambda(x):"; "lambda match:"
            # is identical in behavior.
            markup = re.compile("([\x80-\x9f])").sub(
                lambda match: self._subMSChar(match.group(1)),
                markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # This codec couldn't decode the document; the caller will
            # move on to the next candidate.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # A byte-order mark, if present, overrides the claimed
        # encoding: strip it and decode as what the BOM identifies.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding):
        the (possibly re-encoded) data, the encoding declared in an
        XML prolog or HTML <meta> tag (None if absent), and the
        encoding sniffed from the leading byte pattern/BOM."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # First sniff the physical encoding from the first bytes.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Sniffing is best-effort; any decoding failure leaves the
            # data as-is and we fall through to the regexp search.
            xml_encoding_match = None
        # Now look for a declared encoding in an XML prolog...
        xml_encoding_match = re.compile(
            r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        # ...or, for HTML, in a <meta> charset attribute.
        if not xml_encoding_match and isHTML:
            regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A declared generic multi-byte encoding can't be trusted
            # over the concrete byte order we actually sniffed.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec Python recognizes: try the
        alias table, the name itself, then hyphen-stripped and
        hyphen-to-underscore variants; fall back to the raw name."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built 256-byte translation table, shared on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to its ASCII equivalent."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # windows-1252 "smart punctuation" (\x80-\x9f), mapped to
    # (HTML entity name, hex codepoint) pairs, or to a literal
    # fallback string where no character is defined.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # Bug fix: the codepoint was missing (''), which made
                 # XML mode emit the invalid entity "&#x;". Yuml is
                 # U+0178 (LATIN CAPITAL LETTER Y WITH DIAERESIS).
                 '\x9f' : ('Yuml', '178'),}
2006:
2007: #######################################################################
2008:
2009:
2010: #By default, act as an HTML pretty-printer.
2011: if __name__ == '__main__':
2012: import sys
2013: soup = BeautifulSoup(sys.stdin)
2014: print soup.prettify()
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>