1: """Beautiful Soup
2: Elixir and Tonic
3: "The Screen-Scraper's Friend"
4: http://www.crummy.com/software/BeautifulSoup/
5:
6: Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7: tree representation. It provides methods and Pythonic idioms that make
8: it easy to navigate, search, and modify the tree.
9:
10: A well-formed XML/HTML document yields a well-formed data
11: structure. An ill-formed XML/HTML document yields a correspondingly
12: ill-formed data structure. If your document is only locally
13: well-formed, you can use this library to find and process the
14: well-formed part of it.
15:
16: Beautiful Soup works with Python 2.2 and up. It has no external
17: dependencies, but you'll have more success at converting data to UTF-8
18: if you also install these three packages:
19:
20: * chardet, for auto-detecting character encodings
21: http://chardet.feedparser.org/
22: * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23: by stock Python.
24: http://cjkpython.i18n.org/
25:
26: Beautiful Soup defines classes for two main parsing strategies:
27:
28: * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29: language that kind of looks like XML.
30:
31: * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32: or invalid. This class has web browser-like heuristics for
33: obtaining a sensible parse tree in the face of common HTML errors.
34:
35: Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36: the encoding of an HTML or XML document, and converting it to
37: Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38:
39: For more than you ever wanted to know about Beautiful Soup, see the
40: documentation:
41: http://www.crummy.com/software/BeautifulSoup/documentation.html
42:
43: Here, have some legalese:
44:
45: Copyright (c) 2004-2010, Leonard Richardson
46:
47: All rights reserved.
48:
49: Redistribution and use in source and binary forms, with or without
50: modification, are permitted provided that the following conditions are
51: met:
52:
53: * Redistributions of source code must retain the above copyright
54: notice, this list of conditions and the following disclaimer.
55:
56: * Redistributions in binary form must reproduce the above
57: copyright notice, this list of conditions and the following
58: disclaimer in the documentation and/or other materials provided
59: with the distribution.
60:
  * Neither the name of the Beautiful Soup Consortium and All
62: Night Kosher Bakery nor the names of its contributors may be
63: used to endorse or promote products derived from this software
64: without specific prior written permission.
65:
66: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71: EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72: PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73: PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75: NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77:
78: """
79: from __future__ import generators
80:
81: __author__ = "Leonard Richardson (leonardr@segfault.org)"
82: __version__ = "3.2.0"
83: __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84: __license__ = "New-style BSD"
85:
86: from sgmllib import SGMLParser, SGMLParseError
87: import codecs
88: import markupbase
89: import types
90: import re
91: import sgmllib
92: try:
93: from htmlentitydefs import name2codepoint
94: except ImportError:
95: name2codepoint = {}
96: try:
97: set
98: except NameError:
99: from sets import Set as set
100:
#These hacks make Beautiful Soup able to parse XML with namespaces
#(the replacement patterns also accept ':' and '.' inside tag and
#declaration names).
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Byte encoding used by default whenever a tree is rendered to a string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
106:
107: def _match_css_class(str):
108: """Build a RE to match the given CSS class."""
109: return re.compile(r"(^|.*\s)%s($|\s)" % str)
110:
111: # First, the classes that represent markup elements.
112:
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Elements are doubly linked in two ways: parse order (previous/next)
    and sibling order (previousSibling/nextSibling), plus a parent link.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # We become the parent's newest child, so the current last
            # child becomes our previous sibling.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element,
        which may be an existing member of the same tree."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
                and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                #
                # BUG FIX: this condition used to read
                # "if index and index < myIndex", which skipped the
                # adjustment whenever the replacement was the parent's
                # FIRST child (index 0), leaving the replacement one
                # position too far to the right after the insert.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, preserving
        their order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert in reverse so repeated inserts at the same index end
        # up in the original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, rewiring all parse-order and sibling links."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            # First child: its parse-order predecessor is this element.
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the next element in parse order is
            # the nearest ancestor's next sibling, if any.
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll*-style method with limit 1 and returns the
        single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            # The navigation generators below yield a trailing None
            # when they run off the end of the tree; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        """Yields each element after this one in parse order."""
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        """Yields each later sibling of this element in order."""
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        """Yields each element before this one in parse order."""
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        """Yields each earlier sibling of this element in order."""
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        """Yields each ancestor of this element, innermost first."""
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder with the given
        encoding name (defaulting to utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
423:
class NavigableString(unicode, PageElement):
    # A text node: a Unicode string that also participates in the parse
    # tree via the PageElement navigation mix-in.

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct through __new__ using the encoded
        # (byte-string) form of this string.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trips through the default encoding, yielding a plain
        # unicode object rather than a NavigableString subclass.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # With an encoding, return an encoded byte string; with
        # encoding=None, return the (unicode) string itself.
        if encoding:
            return self.encode(encoding)
        else:
            return self
458:
class CData(NavigableString):
    """A CDATA section: renders its text wrapped in CDATA markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the underlying string, then wrap it in the markers.
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + inner + "]]>"
463:
class ProcessingInstruction(NavigableString):
    """A processing instruction: renders as <?...?>, substituting the
    real encoding for any %SOUP-ENCODING% placeholder."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Fill in the actual encoding if the PI carries the placeholder.
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        return "<?%s?>" % self.toEncoding(text, encoding)
470:
class Comment(NavigableString):
    """An HTML/XML comment: renders its text wrapped in <!-- -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--" + inner + "-->"
474:
class Declaration(NavigableString):
    """A markup declaration: renders its text wrapped in <! >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!" + inner + ">"
478:
479: class Tag(PageElement):
480:
481: """Represents a found HTML tag with its attributes and contents."""
482:
483: def _invert(h):
484: "Cheap function to invert a hash."
485: i = {}
486: for k,v in h.items():
487: i[v] = k
488: return i
489:
490: XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
491: "quot" : '"',
492: "amp" : "&",
493: "lt" : "<",
494: "gt" : ">" }
495:
496: XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497:
498: def _convertEntities(self, match):
499: """Used in a call to re.sub to replace HTML, XML, and numeric
500: entities with the appropriate Unicode characters. If HTML
501: entities are being converted, any unrecognized entities are
502: escaped."""
503: x = match.group(1)
504: if self.convertHTMLEntities and x in name2codepoint:
505: return unichr(name2codepoint[x])
506: elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
507: if self.convertXMLEntities:
508: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
509: else:
510: return u'&%s;' % x
511: elif len(x) > 0 and x[0] == '#':
512: # Handle numeric entities
513: if len(x) > 1 and x[1] == 'x':
514: return unichr(int(x[2:], 16))
515: else:
516: return unichr(int(x[1:]))
517:
518: elif self.escapeUnrecognizedEntities:
519: return u'&%s;' % x
520: else:
521: return u'&%s;' % x
522:
523: def __init__(self, parser, name, attrs=None, parent=None,
524: previous=None):
525: "Basic constructor."
526:
527: # We don't actually store the parser object: that lets extracted
528: # chunks be garbage-collected
529: self.parserClass = parser.__class__
530: self.isSelfClosing = parser.isSelfClosingTag(name)
531: self.name = name
532: if attrs is None:
533: attrs = []
534: elif isinstance(attrs, dict):
535: attrs = attrs.items()
536: self.attrs = attrs
537: self.contents = []
538: self.setup(parent, previous)
539: self.hidden = False
540: self.containsSubstitutions = False
541: self.convertHTMLEntities = parser.convertHTMLEntities
542: self.convertXMLEntities = parser.convertXMLEntities
543: self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
544:
545: # Convert any HTML, XML, or numeric entities in the attribute values.
546: convert = lambda(k, val): (k,
547: re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
548: self._convertEntities,
549: val))
550: self.attrs = map(convert, self.attrs)
551:
552: def getString(self):
553: if (len(self.contents) == 1
554: and isinstance(self.contents[0], NavigableString)):
555: return self.contents[0]
556:
557: def setString(self, string):
558: """Replace the contents of the tag with a string"""
559: self.clear()
560: self.append(string)
561:
562: string = property(getString, setString)
563:
564: def getText(self, separator=u""):
565: if not len(self.contents):
566: return u""
567: stopNode = self._lastRecursiveChild().next
568: strings = []
569: current = self.contents[0]
570: while current is not stopNode:
571: if isinstance(current, NavigableString):
572: strings.append(current.strip())
573: current = current.next
574: return separator.join(strings)
575:
576: text = property(getText)
577:
578: def get(self, key, default=None):
579: """Returns the value of the 'key' attribute for the tag, or
580: the value given for 'default' if it doesn't have that
581: attribute."""
582: return self._getAttrMap().get(key, default)
583:
584: def clear(self):
585: """Extract all children."""
586: for child in self.contents[:]:
587: child.extract()
588:
589: def index(self, element):
590: for i, child in enumerate(self.contents):
591: if child is element:
592: return i
593: raise ValueError("Tag.index: element not in tag")
594:
595: def has_key(self, key):
596: return self._getAttrMap().has_key(key)
597:
598: def __getitem__(self, key):
599: """tag[key] returns the value of the 'key' attribute for the tag,
600: and throws an exception if it's not there."""
601: return self._getAttrMap()[key]
602:
603: def __iter__(self):
604: "Iterating over a tag iterates over its contents."
605: return iter(self.contents)
606:
607: def __len__(self):
608: "The length of a tag is the length of its list of contents."
609: return len(self.contents)
610:
611: def __contains__(self, x):
612: return x in self.contents
613:
614: def __nonzero__(self):
615: "A tag is non-None even if it has no contents."
616: return True
617:
618: def __setitem__(self, key, value):
619: """Setting tag[key] sets the value of the 'key' attribute for the
620: tag."""
621: self._getAttrMap()
622: self.attrMap[key] = value
623: found = False
624: for i in range(0, len(self.attrs)):
625: if self.attrs[i][0] == key:
626: self.attrs[i] = (key, value)
627: found = True
628: if not found:
629: self.attrs.append((key, value))
630: self._getAttrMap()[key] = value
631:
632: def __delitem__(self, key):
633: "Deleting tag[key] deletes all 'key' attributes for the tag."
634: for item in self.attrs:
635: if item[0] == key:
636: self.attrs.remove(item)
637: #We don't break because bad HTML can define the same
638: #attribute multiple times.
639: self._getAttrMap()
640: if self.attrMap.has_key(key):
641: del self.attrMap[key]
642:
643: def __call__(self, *args, **kwargs):
644: """Calling a tag like a function is the same as calling its
645: findAll() method. Eg. tag('a') returns a list of all the A tags
646: found within this tag."""
647: return apply(self.findAll, args, kwargs)
648:
649: def __getattr__(self, tag):
650: #print "Getattr %s.%s" % (self.__class__, tag)
651: if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652: return self.find(tag[:-3])
653: elif tag.find('__') != 0:
654: return self.find(tag)
655: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
656:
657: def __eq__(self, other):
658: """Returns true iff this tag has the same name, the same attributes,
659: and the same contents (recursively) as the given tag.
660:
661: NOTE: right now this will return false if two tags have the
662: same attributes in a different order. Should this be fixed?"""
663: if other is self:
664: return True
665: if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
666: return False
667: for i in range(0, len(self.contents)):
668: if self.contents[i] != other.contents[i]:
669: return False
670: return True
671:
672: def __ne__(self, other):
673: """Returns true iff this tag is not identical to the other tag,
674: as defined in __eq__."""
675: return not self == other
676:
677: def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678: """Renders this tag as a string."""
679: return self.__str__(encoding)
680:
681: def __unicode__(self):
682: return self.__str__(None)
683:
684: BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685: + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686: + ")")
687:
688: def _sub_entity(self, x):
689: """Used with a regular expression to substitute the
690: appropriate XML entity for an XML special character."""
691: return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692:
693: def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694: prettyPrint=False, indentLevel=0):
695: """Returns a string or Unicode representation of this tag and
696: its contents. To get Unicode, pass None for encoding.
697:
698: NOTE: since Python's HTML parser consumes whitespace, this
699: method is not certain to reproduce the whitespace present in
700: the original string."""
701:
702: encodedName = self.toEncoding(self.name, encoding)
703:
704: attrs = []
705: if self.attrs:
706: for key, val in self.attrs:
707: fmt = '%s="%s"'
708: if isinstance(val, basestring):
709: if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710: val = self.substituteEncoding(val, encoding)
711:
712: # The attribute value either:
713: #
714: # * Contains no embedded double quotes or single quotes.
715: # No problem: we enclose it in double quotes.
716: # * Contains embedded single quotes. No problem:
717: # double quotes work here too.
718: # * Contains embedded double quotes. No problem:
719: # we enclose it in single quotes.
720: # * Embeds both single _and_ double quotes. This
721: # can't happen naturally, but it can happen if
722: # you modify an attribute value after parsing
723: # the document. Now we have a bit of a
724: # problem. We solve it by enclosing the
725: # attribute in single quotes, and escaping any
726: # embedded single quotes to XML entities.
727: if '"' in val:
728: fmt = "%s='%s'"
729: if "'" in val:
730: # TODO: replace with apos when
731: # appropriate.
732: val = val.replace("'", "&squot;")
733:
734: # Now we're okay w/r/t quotes. But the attribute
735: # value might also contain angle brackets, or
736: # ampersands that aren't part of entities. We need
737: # to escape those to XML entities too.
738: val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
739:
740: attrs.append(fmt % (self.toEncoding(key, encoding),
741: self.toEncoding(val, encoding)))
742: close = ''
743: closeTag = ''
744: if self.isSelfClosing:
745: close = ' /'
746: else:
747: closeTag = '</%s>' % encodedName
748:
749: indentTag, indentContents = 0, 0
750: if prettyPrint:
751: indentTag = indentLevel
752: space = (' ' * (indentTag-1))
753: indentContents = indentTag + 1
754: contents = self.renderContents(encoding, prettyPrint, indentContents)
755: if self.hidden:
756: s = contents
757: else:
758: s = []
759: attributeString = ''
760: if attrs:
761: attributeString = ' ' + ' '.join(attrs)
762: if prettyPrint:
763: s.append(space)
764: s.append('<%s%s%s>' % (encodedName, attributeString, close))
765: if prettyPrint:
766: s.append("\n")
767: s.append(contents)
768: if prettyPrint and contents and contents[-1] != "\n":
769: s.append("\n")
770: if prettyPrint and closeTag:
771: s.append(space)
772: s.append(closeTag)
773: if prettyPrint and closeTag and self.nextSibling:
774: s.append("\n")
775: s = ''.join(s)
776: return s
777:
778: def decompose(self):
779: """Recursively destroys the contents of this tree."""
780: self.extract()
781: if len(self.contents) == 0:
782: return
783: current = self.contents[0]
784: while current is not None:
785: next = current.next
786: if isinstance(current, Tag):
787: del current.contents[:]
788: current.parent = None
789: current.previous = None
790: current.previousSibling = None
791: current.next = None
792: current.nextSibling = None
793: current = next
794:
795: def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796: return self.__str__(encoding, True)
797:
798: def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799: prettyPrint=False, indentLevel=0):
800: """Renders the contents of this tag as a string in the given
801: encoding. If encoding is None, returns a Unicode string.."""
802: s=[]
803: for c in self:
804: text = None
805: if isinstance(c, NavigableString):
806: text = c.__str__(encoding)
807: elif isinstance(c, Tag):
808: s.append(c.__str__(encoding, prettyPrint, indentLevel))
809: if text and prettyPrint:
810: text = text.strip()
811: if text:
812: if prettyPrint:
813: s.append(" " * (indentLevel-1))
814: s.append(text)
815: if prettyPrint:
816: s.append("\n")
817: return ''.join(s)
818:
819: #Soup methods
820:
821: def find(self, name=None, attrs={}, recursive=True, text=None,
822: **kwargs):
823: """Return only the first child of this Tag matching the given
824: criteria."""
825: r = None
826: l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
827: if l:
828: r = l[0]
829: return r
830: findChild = find
831:
832: def findAll(self, name=None, attrs={}, recursive=True, text=None,
833: limit=None, **kwargs):
834: """Extracts a list of Tag objects that match the given
835: criteria. You can specify the name of the Tag and any
836: attributes you want the Tag to have.
837:
838: The value of a key-value pair in the 'attrs' map can be a
839: string, a list of strings, a regular expression object, or a
840: callable that takes a string and returns whether or not the
841: string matches for some custom definition of 'matches'. The
842: same is true of the tag name."""
843: generator = self.recursiveChildGenerator
844: if not recursive:
845: generator = self.childGenerator
846: return self._findAll(name, attrs, text, limit, generator, **kwargs)
847: findChildren = findAll
848:
849: # Pre-3.x compatibility methods
850: first = find
851: fetch = findAll
852:
853: def fetchText(self, text=None, recursive=True, limit=None):
854: return self.findAll(text=text, recursive=recursive, limit=limit)
855:
856: def firstText(self, text=None, recursive=True):
857: return self.find(text=text, recursive=recursive)
858:
859: #Private methods
860:
861: def _getAttrMap(self):
862: """Initializes a map representation of this tag's attributes,
863: if not already initialized."""
864: if not getattr(self, 'attrMap'):
865: self.attrMap = {}
866: for (key, value) in self.attrs:
867: self.attrMap[key] = value
868: return self.attrMap
869:
870: #Generator methods
871: def childGenerator(self):
872: # Just use the iterator from the contents
873: return iter(self.contents)
874:
875: def recursiveChildGenerator(self):
876: if not len(self.contents):
877: raise StopIteration
878: stopNode = self._lastRecursiveChild().next
879: current = self.contents[0]
880: while current is not stopNode:
881: yield current
882: current = current.next
883:
884:
885: # Next, a couple classes to represent queries and their results.
886: class SoupStrainer:
887: """Encapsulates a number of ways of matching a markup element (tag or
888: text)."""
889:
890: def __init__(self, name=None, attrs={}, text=None, **kwargs):
891: self.name = name
892: if isinstance(attrs, basestring):
893: kwargs['class'] = _match_css_class(attrs)
894: attrs = None
895: if kwargs:
896: if attrs:
897: attrs = attrs.copy()
898: attrs.update(kwargs)
899: else:
900: attrs = kwargs
901: self.attrs = attrs
902: self.text = text
903:
904: def __str__(self):
905: if self.text:
906: return self.text
907: else:
908: return "%s|%s" % (self.name, self.attrs)
909:
910: def searchTag(self, markupName=None, markupAttrs={}):
911: found = None
912: markup = None
913: if isinstance(markupName, Tag):
914: markup = markupName
915: markupAttrs = markup
916: callFunctionWithTagData = callable(self.name) \
917: and not isinstance(markupName, Tag)
918:
919: if (not self.name) \
920: or callFunctionWithTagData \
921: or (markup and self._matches(markup, self.name)) \
922: or (not markup and self._matches(markupName, self.name)):
923: if callFunctionWithTagData:
924: match = self.name(markupName, markupAttrs)
925: else:
926: match = True
927: markupAttrMap = None
928: for attr, matchAgainst in self.attrs.items():
929: if not markupAttrMap:
930: if hasattr(markupAttrs, 'get'):
931: markupAttrMap = markupAttrs
932: else:
933: markupAttrMap = {}
934: for k,v in markupAttrs:
935: markupAttrMap[k] = v
936: attrValue = markupAttrMap.get(attr)
937: if not self._matches(attrValue, matchAgainst):
938: match = False
939: break
940: if match:
941: if markup:
942: found = markup
943: else:
944: found = markupName
945: return found
946:
947: def search(self, markup):
948: #print 'looking for %s in %s' % (self, markup)
949: found = None
950: # If given a list of items, scan it for a text element that
951: # matches.
952: if hasattr(markup, "__iter__") \
953: and not isinstance(markup, Tag):
954: for element in markup:
955: if isinstance(element, NavigableString) \
956: and self.search(element):
957: found = element
958: break
959: # If it's a Tag, make sure its name or attributes match.
960: # Don't bother with Tags if we're searching for text.
961: elif isinstance(markup, Tag):
962: if not self.text:
963: found = self.searchTag(markup)
964: # If it's text, make sure the text matches.
965: elif isinstance(markup, NavigableString) or \
966: isinstance(markup, basestring):
967: if self._matches(markup, self.text):
968: found = markup
969: else:
970: raise Exception, "I don't know how to match against a %s" \
971: % markup.__class__
972: return found
973:
974: def _matches(self, markup, matchAgainst):
975: #print "Matching %s against %s" % (markup, matchAgainst)
976: result = False
977: if matchAgainst is True:
978: result = markup is not None
979: elif callable(matchAgainst):
980: result = matchAgainst(markup)
981: else:
982: #Custom match methods take the tag as an argument, but all
983: #other ways of matching match the tag name as a string.
984: if isinstance(markup, Tag):
985: markup = markup.name
986: if markup and not isinstance(markup, basestring):
987: markup = unicode(markup)
988: #Now we know that chunk is either a string, or None.
989: if hasattr(matchAgainst, 'match'):
990: # It's a regexp object.
991: result = markup and matchAgainst.search(markup)
992: elif hasattr(matchAgainst, '__iter__'): # list-like
993: result = markup in matchAgainst
994: elif hasattr(matchAgainst, 'items'):
995: result = markup.has_key(matchAgainst)
996: elif matchAgainst and isinstance(markup, basestring):
997: if isinstance(markup, unicode):
998: matchAgainst = unicode(matchAgainst)
999: else:
1000: matchAgainst = str(matchAgainst)
1001:
1002: if not result:
1003: result = matchAgainst == markup
1004: return result
1005:
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway list instead of self. Initialize
        # this instance (empty) instead.
        list.__init__(self)
        # The SoupStrainer that produced these results.
        self.source = source
1012:
1013: # Now, some helper functions.
1014:
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Duck-typing (hasattr) rather than isinstance is deliberate: on
    Python 2 a plain string has no __iter__, so it falls through to
    the scalar branch and is mapped whole."""
    mapping = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # Map: merge its entries, later portions winning.
            mapping.update(portion)
        elif hasattr(portion, '__iter__'): # is a list
            # Sequence: each element maps to the default value.
            for key in portion:
                mapping[key] = default
        else:
            # Scalar: map it to the default value.
            mapping[portion] = default
    return mapping
1033:
1034: # Now, the parser classes.
1035:
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these to encode tag behavior; the base XML
    # parser assumes nothing.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the markup before parsing (see __init__):
    # add a space before "/>" and strip whitespace after "<!".
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the root of the parse tree (the soup object itself).
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        # NOTE(review): with selfClosingTags=None this produces
        # {None: None}, not {} -- harmless, since isSelfClosingTag only
        # looks up real tag names.
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised (e.g. by BeautifulSoup.start_meta) after the
            # document has been re-fed with a better encoding; the
            # parse tree is already complete at this point.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            # Not a numeric reference; returns None (reference is
            # left unconverted).
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, apply the markup-massage
        fixups, and run it through the SGML parser, building the tree.

        inDocumentEncoding -- an encoding declared inside the document
          (e.g. by a meta tag), tried before sniffed encodings.
        isHTML -- passed through to UnicodeDammit's encoding sniffer."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not hasattr(self.markupMassage, "__iter__"):
                    # markupMassage=True means "use the default fixups";
                    # a caller-supplied list is used as-is.
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset both superclasses and reinitialize the tag stack so the
        soup object itself is the sole (root) open tag."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # The root tag is hidden so it doesn't appear in output.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Close the most recently opened tag and return the new
        current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Open a tag: attach it to the current tag's contents and make
        it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the accumulated character data into the tree as a
        single node of the given NavigableString subclass, collapsing
        whitespace-only runs unless inside a PRESERVE_WHITESPACE tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # translate() with a codepoint map requires unicode, which
            # currentData is (u''.join above).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, drop top-level text that the
            # strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Search downward from the top of the stack (index 0 is the
        # root, which is excluded).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk down the open-tag stack looking for either a previous
        # tag of this name or a nesting-reset boundary.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment has no effect -- p is
            # rebound from the stack at the top of the next iteration.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback for an opening tag: build a Tag node, wire
        it into the tree, and handle self-closing and quoted (literal)
        tags. Returns the new Tag, or None if it was filtered out."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Inside a QUOTE_TAGS section, re-serialize the tag as text.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing selectively, skip top-level tags the strainer
        # doesn't want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback for a closing tag: flush pending text and
        pop the stack to the matching open tag, tracking QUOTE_TAGS
        (literal) sections."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Accumulate character data until endData() flushes it."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Leave the reference in its literal &#NNN; form.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: take everything to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # A declaration sgmllib can't parse becomes plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1466:
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same signature as BeautifulStoneSoup, but smart quotes become
        HTML entities by default and the encoding sniffer is told to
        look for HTML hints (e.g. meta tags)."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags that have no closing tag; they are popped as soon as opened.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    # Whitespace-only text inside these tags is kept verbatim instead
    # of being collapsed to a single space (see endData).
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Content of these tags is treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; the re-feed above has already
                        # rebuilt the tree with the right encoding.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1619:
class StopParsing(Exception):
    """Raised to abort the current parsing pass; caught in
    BeautifulStoneSoup.__init__. Used by BeautifulSoup.start_meta after
    re-feeding the document with a newly discovered encoding."""
    pass
1622:
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' appear twice in this tuple;
    # harmless, since buildTagMap deduplicates via dict keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend BeautifulSoup's nesting rules with the tags above, each
    # mapped to [] (nestable, no reset triggers).
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1658:
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's first parameter is the *default
    # value*, so this call passes no tags at all and yields an empty
    # map -- 'noscript' does NOT reset nesting here. If that were the
    # intent it would read buildTagMap(None, 'noscript'). Confirm
    # before changing: existing callers may rely on the empty map.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1671:
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a single-string child tag into the
        parent as an attribute (unless the parent already has an
        attribute of that name), then defer to the superclass."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Ensure parent.attrMap exists before checking it below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1702:
1703: #Enterprise class names! It has come to our attention that some people
1704: #think the names of the Beautiful Soup parser classes are too silly
1705: #and "unprofessional" for use in enterprise screen-scraping. We feel
1706: #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707: #All-Night Kosher Bakery recommends renaming this file to
1708: #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709: #"RobustParserBeanInterface.class") and using the following
1710: #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
1721:
1722: ######################################################
1723: #
1724: # Bonus library: Unicode, Dammit
1725: #
1726: # This class forces XML data into a standard format (usually to UTF-8
1727: # or Unicode). It is heavily based on code from Mark Pilgrim's
1728: # Universal Feed Parser. It does not rewrite the XML or HTML to
1729: # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730: # (XML) and BeautifulSoup.start_meta (HTML).
1731:
1732: # Autodetects character encodings.
1733: # Download from http://chardet.feedparser.org/
1734: try:
1735: import chardet
1736: # import chardet.constants
1737: # chardet.constants._debug = 1
1738: except ImportError:
1739: chardet = None
1740:
1741: # cjkcodecs and iconv_codec make Python know about more character encodings.
1742: # Both are available from http://cjkpython.i18n.org/
1743: # They're built in if you use Python 2.4.
1744: try:
1745: import cjkcodecs.aliases
1746: except ImportError:
1747: pass
1748: try:
1749: import iconv_codec
1750: except ImportError:
1751: pass
1752:
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=None,
                 smartQuotesTo='xml', isHTML=False):
        """Detect the encoding of markup and decode it to Unicode.

        markup -- the document: a byte string, or an already-Unicode
            string (which is passed through untouched).
        overrideEncodings -- encodings to try before anything detected
            from the document itself. (The default used to be a shared
            mutable list ``[]``; ``None`` avoids the mutable-default
            pitfall and is equivalent for every caller.)
        smartQuotesTo -- 'xml' or 'html': what to convert the
            windows-1252 "smart punctuation" characters into. A false
            value leaves them alone.
        isHTML -- if true, also honor a <meta> charset declaration.

        On success self.unicode holds the decoded document and
        self.originalEncoding names the codec that worked; if every
        candidate fails, both are None.
        """
        if overrideEncodings is None:
            overrideEncodings = []
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Already-Unicode (or empty) input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        # 1. Caller-supplied encodings take priority.
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        # 2. Then whatever the document declares, then what its byte
        #    pattern / BOM implies.
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # 3. If no luck and we have an auto-detection library, try that.
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # 4. As a last resort, try utf-8 and windows-1252.
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Return the replacement for one MS smart-quote character:
        a numeric XML entity, a named HTML entity, or the literal
        fallback string stored in MS_CHARS, depending on
        self.smartQuotesTo."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the proposed encoding.

        Returns the Unicode string on success, None on failure. Each
        codec is attempted at most once per document (triedEncodings)."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them. \x80-\x9f is the windows-1252
        # "smart punctuation" range.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            # NOTE: was the py2-only form "lambda(x):"; "lambda match:"
            # is identical in behavior.
            markup = re.compile("([\x80-\x9f])").sub(
                lambda match: self._subMSChar(match.group(1)),
                markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # This codec couldn't decode the document; the caller will
            # move on to the next candidate.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # A byte-order mark, if present, overrides the claimed
        # encoding: strip it and decode as what the BOM identifies.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding):
        the (possibly re-encoded) data, the encoding declared in an
        XML prolog or HTML <meta> tag (None if absent), and the
        encoding sniffed from the leading byte pattern/BOM."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # First sniff the physical encoding from the first bytes.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Sniffing is best-effort; any decoding failure leaves the
            # data as-is and we fall through to the regexp search.
            xml_encoding_match = None
        # Now look for a declared encoding in an XML prolog...
        xml_encoding_match = re.compile(
            r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        # ...or, for HTML, in a <meta> charset attribute.
        if not xml_encoding_match and isHTML:
            regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A declared generic multi-byte encoding can't be trusted
            # over the concrete byte order we actually sniffed.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec Python recognizes: try the
        alias table, the name itself, then hyphen-stripped and
        hyphen-to-underscore variants; fall back to the raw name."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built 256-byte translation table, shared on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to its ASCII equivalent."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # windows-1252 "smart punctuation" (\x80-\x9f), mapped to
    # (HTML entity name, hex codepoint) pairs, or to a literal
    # fallback string where no character is defined.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # Bug fix: the codepoint was missing (''), which made
                 # XML mode emit the invalid entity "&#x;". Yuml is
                 # U+0178 (LATIN CAPITAL LETTER Y WITH DIAERESIS).
                 '\x9f' : ('Yuml', '178'),}
2006:
2007: #######################################################################
2008:
2009:
2010: #By default, act as an HTML pretty-printer.
2011: if __name__ == '__main__':
2012: import sys
2013: soup = BeautifulSoup(sys.stdin)
2014: print soup.prettify()
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>