Annotation of MPIWGWeb/BeautifulSoup.py, revision 1.1.2.1
1.1.2.1 ! dwinter 1: """Beautiful Soup
! 2: Elixir and Tonic
! 3: "The Screen-Scraper's Friend"
! 4: http://www.crummy.com/software/BeautifulSoup/
! 5:
! 6: Beautiful Soup parses a (possibly invalid) XML or HTML document into a
! 7: tree representation. It provides methods and Pythonic idioms that make
! 8: it easy to navigate, search, and modify the tree.
! 9:
! 10: A well-formed XML/HTML document yields a well-formed data
! 11: structure. An ill-formed XML/HTML document yields a correspondingly
! 12: ill-formed data structure. If your document is only locally
! 13: well-formed, you can use this library to find and process the
! 14: well-formed part of it.
! 15:
! 16: Beautiful Soup works with Python 2.2 and up. It has no external
! 17: dependencies, but you'll have more success at converting data to UTF-8
! 18: if you also install these three packages:
! 19:
! 20: * chardet, for auto-detecting character encodings
! 21: http://chardet.feedparser.org/
! 22: * cjkcodecs and iconv_codec, which add more encodings to the ones supported
! 23: by stock Python.
! 24: http://cjkpython.i18n.org/
! 25:
! 26: Beautiful Soup defines classes for two main parsing strategies:
! 27:
! 28: * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
! 29: language that kind of looks like XML.
! 30:
! 31: * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
! 32: or invalid. This class has web browser-like heuristics for
! 33: obtaining a sensible parse tree in the face of common HTML errors.
! 34:
! 35: Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
! 36: the encoding of an HTML or XML document, and converting it to
! 37: Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
! 38:
! 39: For more than you ever wanted to know about Beautiful Soup, see the
! 40: documentation:
! 41: http://www.crummy.com/software/BeautifulSoup/documentation.html
! 42:
! 43: Here, have some legalese:
! 44:
! 45: Copyright (c) 2004-2010, Leonard Richardson
! 46:
! 47: All rights reserved.
! 48:
! 49: Redistribution and use in source and binary forms, with or without
! 50: modification, are permitted provided that the following conditions are
! 51: met:
! 52:
! 53: * Redistributions of source code must retain the above copyright
! 54: notice, this list of conditions and the following disclaimer.
! 55:
! 56: * Redistributions in binary form must reproduce the above
! 57: copyright notice, this list of conditions and the following
! 58: disclaimer in the documentation and/or other materials provided
! 59: with the distribution.
! 60:
 	         ! 61:   * Neither the name of the Beautiful Soup Consortium and All
! 62: Night Kosher Bakery nor the names of its contributors may be
! 63: used to endorse or promote products derived from this software
! 64: without specific prior written permission.
! 65:
! 66: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! 67: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! 68: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! 69: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
! 70: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
! 71: EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! 72: PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
! 73: PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
! 74: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
! 75: NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! 76: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
! 77:
! 78: """
! 79: from __future__ import generators
! 80:
! 81: __author__ = "Leonard Richardson (leonardr@segfault.org)"
! 82: __version__ = "3.2.0"
! 83: __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
! 84: __license__ = "New-style BSD"
! 85:
! 86: from sgmllib import SGMLParser, SGMLParseError
! 87: import codecs
! 88: import markupbase
! 89: import types
! 90: import re
! 91: import sgmllib
! 92: try:
! 93: from htmlentitydefs import name2codepoint
! 94: except ImportError:
! 95: name2codepoint = {}
! 96: try:
! 97: set
! 98: except NameError:
! 99: from sets import Set as set
! 100:
! 101: #These hacks make Beautiful Soup able to parse XML with namespaces
! 102: sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
! 103: markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
! 104:
! 105: DEFAULT_OUTPUT_ENCODING = "utf-8"
! 106:
! 107: def _match_css_class(str):
! 108: """Build a RE to match the given CSS class."""
! 109: return re.compile(r"(^|.*\s)%s($|\s)" % str)
! 110:
! 111: # First, the classes that represent markup elements.
! 112:
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element
        (a tag or a string)."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            # BUGFIX: the original guard was "if index and index < myIndex",
            # which skipped the adjustment when the replacement sibling was
            # the first child (index 0) and so inserted it one slot too
            # late.  The adjustment must apply whenever the sibling
            # precedes this element.
            if index < myIndex:
                # The sibling comes before this element, so extracting
                # it shifts this element's index down by one.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element in the tree with its own children."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        # Inserting repeatedly at the same index in reverse order
        # preserves the children's document order.
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild at the given position among this element's
        contents, rewiring all next/previous/sibling links."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up the tree to find the first ancestor with a
            # next sibling; that sibling is the next parse event.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent:  # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings  # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious  # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings  # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents  # Compatibility with pre-3.x

    # These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll*-style method with limit=1 and unwraps the
        single result (or None)."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These Generators can be used to navigate starting from both
    # NavigableStrings and Tags.  Note that each yields AFTER stepping,
    # so the final yielded value is None; callers guard with "if i:".
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder with the given
        encoding (default utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify it, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
! 423:
class NavigableString(unicode, PageElement):
    """A Unicode string that also carries PageElement navigation links."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct from the encoded byte string.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        raise AttributeError(
            "'%s' object has no attribute '%s'"
            % (self.__class__.__name__, attr))

    def __unicode__(self):
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # With encoding=None, return the Unicode object itself.
        if not encoding:
            return self
        return self.encode(encoding)
! 458:
class CData(NavigableString):
    """A CDATA section: rendered wrapped in <![CDATA[...]]> markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % body
! 463:
class ProcessingInstruction(NavigableString):
    """A processing instruction: rendered wrapped in <?...?> markers,
    with any %SOUP-ENCODING% placeholder substituted."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)
! 470:
class Comment(NavigableString):
    """An HTML/XML comment: rendered wrapped in <!--...--> markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
! 474:
class Declaration(NavigableString):
    """A declaration (e.g. DOCTYPE): rendered wrapped in <!...> markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
! 478:
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
                                     "quot": '"',
                                     "amp": "&",
                                     "lt": "<",
                                     "gt": ">"}

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))
        elif self.escapeUnrecognizedEntities:
            # BUGFIX: escape the ampersand so the unrecognized entity
            # survives as literal text.  The original returned the same
            # string as the fall-through branch below, which made the
            # escapeUnrecognizedEntities flag a no-op.
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute
        # values.  (Rewritten from a Python-2-only tuple-unpacking
        # lambda; behavior is identical.)
        self.attrs = [(key, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                   self._convertEntities, val))
                      for key, val in self.attrs]

    def getString(self):
        """Returns the tag's single NavigableString child, if that is
        all it contains; otherwise None."""
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        """Concatenates the stripped text of all strings beneath this
        tag, joined by the given separator."""
        if not len(self.contents):
            return u""
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        """Returns the position of the given element among this tag's
        children, using identity (not equality) comparison."""
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        """True if the tag has an attribute named 'key'."""
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                # We don't break because bad HTML can define the same
                # attribute multiple times.
            self._getAttrMap()
            if key in self.attrMap:
                del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        # apply() was deprecated long ago; argument unpacking is
        # behaviorally identical.
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Attribute access falls through to find(): tag.fooTag and
        tag.foo both return the first 'foo' child tag."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches angle brackets, and ampersands that are not already part
    # of a numeric or named entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        current = self.contents[0]
        while current is not None:
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag with pretty-printed (indented) output."""
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    # Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        """Pre-3.x compatibility: find all matching text nodes."""
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        """Pre-3.x compatibility: find the first matching text node."""
        return self.find(text=text, recursive=recursive)

    # Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # NOTE: getattr() without a default works here only because
        # Tag.__getattr__ returns a falsy find() result on first access.
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    # Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag, in document order."""
        if not len(self.contents):
            # A bare "return" ends the generator; equivalent to the
            # original "raise StopIteration" but PEP 479-safe.
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
! 883:
! 884:
! 885: # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    'name' and 'text' may each be a string, a regular expression object,
    a list, a callable, or True; attribute constraints come from 'attrs'
    (a map) and/or keyword arguments."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        # A bare string for 'attrs' is shorthand for a CSS class match.
        if isinstance(attrs, basestring):
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            # Merge keyword constraints into the attrs map; copy first so
            # the caller's dict is never mutated.
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Check a tag (or a name/attrs pair) against this strainer.

        Returns the matching Tag (or the name, when called with a bare
        name) or None if there is no match."""
        found = None
        markup = None
        # Accept either a Tag object or a (name, attrs) pair; a Tag
        # doubles as its own attribute map.
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable 'name' acts as a custom predicate over (name, attrs),
        # but only when we were given a bare name rather than a Tag.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                # Every attribute constraint must match; build a dict view
                # of the markup's attributes lazily, on first use.
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                         if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                         else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Check an arbitrary piece of markup (Tag, string, or list of
        elements) against this strainer; return the match or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__

        return found

    def _matches(self, markup, matchAgainst):
        """Match a single value against one criterion, which may be True,
        a callable, a regexp, a list, a map, or a string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            # True means "any value at all, as long as one exists".
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): at this point 'markup' is a string or None,
                # neither of which has has_key(); this branch looks
                # unreachable/broken — confirm before relying on dict
                # values in 'attrs'.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Compare like with like: coerce the criterion to the
                # markup's string type before the equality test below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
! 1005:
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway temporary list instead of this
        # instance.  Harmless only by luck (a new list is already
        # empty); initialize self properly.
        list.__init__(self)
        # The SoupStrainer that produced this result set.
        self.source = source
! 1012:
! 1013: # Now, some helper functions.
! 1014:
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries verbatim.
            built.update(portion.items())
        elif hasattr(portion, '__iter__'): # is a list
            # A list: every member maps to the default.
            for key in portion:
                built[key] = default
        else:
            # A scalar: map it directly to the default.
            built[portion] = default
    return built
! 1033:
! 1034: # Now, the parser classes.
! 1035:
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Tag-behavior tables; all empty here, overridden by HTML subclasses.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default input-sanitizing regexes: add a space before "/>" and strip
    # whitespace after "<!", the two malformations that choke sgmllib.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised internally to abort parsing (e.g. for a re-parse
            # with a newly discovered encoding); not an error.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the stored markup to Unicode, massage it, and run it
        through the SGML parser, building the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True means "use the default regexes";
                # otherwise it is itself a list of (regex, replacement).
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset parser state: the soup becomes an empty root tag with
        empty tag/quote stacks."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # The root tag is hidden so it never appears in output.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Close the tag on top of the stack; its parent becomes the
        current tag again."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Open a new tag: append it to the current tag's contents and
        make it the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated character data into the tree as a
        containerClass node (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse ASCII-whitespace-only text to one space (or one
            # newline), unless we're inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level text
            # that the strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            # Wire the new node into the tree and the .next/.previous chain.
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instqance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Search from the top of the stack for the nearest matching tag.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk down the open-tag stack looking for the pop target.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback for an unrecognized start tag: build a Tag
        node and push it (unless we're inside a quoted/literal region)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of the document, skip top-level tags
        # the strainer doesn't want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback for an unrecognized end tag: pop the stack
        to the matching start tag (unless inside a quoted region)."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Accumulate character data until endData() flushes it."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 # Unterminated CDATA: take everything to end of input.
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Unparseable declaration: keep it as literal text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
! 1466:
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart quotes to HTML entities and flag the input as
        # HTML before delegating to the generic parser.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull http-equiv and content out of the attribute list,
        # remembering where 'content' sits so we can rewrite it.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
! 1619:
class StopParsing(Exception):
    """Raised internally to abort a parse in progress (for example when
    start_meta discovers a new encoding and re-feeds the document)."""
! 1622:
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Cleanup: the original tuple listed 'strong' and 'big' twice;
    # buildTagMap builds a dict, so removing the duplicates is
    # behavior-identical.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
! 1658:
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* with no further args, so this evaluates to {} — possibly
    # buildTagMap(None, 'noscript') was intended; confirm before changing.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
! 1671:
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the tag is closed, copy a lone string child up into the
        # parent as an attribute — but never clobber an existing one.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
! 1702:
! 1703: #Enterprise class names! It has come to our attention that some people
! 1704: #think the names of the Beautiful Soup parser classes are too silly
! 1705: #and "unprofessional" for use in enterprise screen-scraping. We feel
! 1706: #your pain! For such-minded folk, the Beautiful Soup Consortium And
! 1707: #All-Night Kosher Bakery recommends renaming this file to
! 1708: #"RobustParser.py" (or, in cases of extreme enterprisiness,
! 1709: #"RobustParserBeanInterface.class") and using the following
! 1710: #enterprise-friendly class aliases:
# Each alias below is a plain renaming of one of the soup classes and
# adds no behavior of its own.

class RobustXMLParser(BeautifulStoneSoup):
    # Alias for BeautifulStoneSoup: XML/SGML and XML-like languages.
    pass
class RobustHTMLParser(BeautifulSoup):
    # Alias for BeautifulSoup: ordinary, possibly invalid HTML.
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    # Alias for ICantBelieveItsBeautifulSoup: HTML that really means it.
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    # Alias for MinimalSoup: pathologically bad markup.
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    # Alias for BeautifulSOAP: subelement-to-attribute promotion.
    pass
! 1721:
! 1722: ######################################################
! 1723: #
! 1724: # Bonus library: Unicode, Dammit
! 1725: #
! 1726: # This class forces XML data into a standard format (usually to UTF-8
! 1727: # or Unicode). It is heavily based on code from Mark Pilgrim's
! 1728: # Universal Feed Parser. It does not rewrite the XML or HTML to
! 1729: # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
! 1730: # (XML) and BeautifulSoup.start_meta (HTML).
! 1731:
! 1732: # Autodetects character encodings.
! 1733: # Download from http://chardet.feedparser.org/
! 1734: try:
! 1735: import chardet
! 1736: # import chardet.constants
! 1737: # chardet.constants._debug = 1
! 1738: except ImportError:
! 1739: chardet = None
! 1740:
! 1741: # cjkcodecs and iconv_codec make Python know about more character encodings.
! 1742: # Both are available from http://cjkpython.i18n.org/
! 1743: # They're built in if you use Python 2.4.
! 1744: try:
! 1745: import cjkcodecs.aliases
! 1746: except ImportError:
! 1747: pass
! 1748: try:
! 1749: import iconv_codec
! 1750: except ImportError:
! 1751: pass
! 1752:
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=None,
                 smartQuotesTo='xml', isHTML=False):
        """Detect the encoding of markup and decode it to Unicode.

        overrideEncodings -- encodings to try before any detected ones.
        smartQuotesTo -- 'xml' or 'html': entity style used when
            replacing Windows smart-quote bytes; a false value disables
            the replacement.
        isHTML -- if true, also look for a charset in <meta> tags.

        On exit, self.unicode holds the converted document (or None if
        every candidate encoding failed) and self.originalEncoding the
        encoding that worked.
        """
        # The default used to be a shared mutable list; substitute a
        # fresh one here to avoid the mutable-default-argument pitfall.
        if overrideEncodings is None:
            overrideEncodings = []
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to do: empty input, or already Unicode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Try candidate encodings in priority order: caller overrides,
        # then declared/sniffed, then chardet, then common fallbacks.
        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity, depending on self.smartQuotesTo.  Characters with no
        entity mapping (plain strings in MS_CHARS) pass through as-is."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the encoding 'proposed'.

        Returns the Unicode result (also stored in self.markup, with
        self.originalEncoding set) on success; returns None if the
        codec is unknown, was already tried, or fails to decode.
        """
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            # A plain one-argument lambda replaces the old "lambda(x)"
            # tuple-parameter form, which was removed in Python 3.
            markup = re.compile("([\x80-\x9f])").sub(
                lambda match: self._subMSChar(match.group(1)),
                markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # This encoding didn't work; signal failure so the caller
            # can try the next candidate.
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a BOM also overrides the
        # proposed encoding, since it is unambiguous.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (possibly transcoded data, declared encoding or None,
        sniffed encoding or None).  Sniffing inspects the first bytes
        for a BOM or an encoding-revealing '<?' pattern; the declared
        encoding comes from the XML declaration or (if isHTML) a
        <meta charset> tag, and is also recorded in
        self.declaredHTMLEncoding for HTML input."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best-effort: fall back to the declared
            # encoding (if any) on undecodable input.
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            # The declared name is a generic UTF family; trust the
            # byte-order actually sniffed from the document.
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a Python codec name, trying known
        aliases and hyphen/underscore variants; falls back to the
        input name itself when nothing resolves."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table from EBCDIC to ASCII, built lazily on first use.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC-encoded byte string to ASCII."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            # string.maketrans is the Python 2 API for byte-string
            # translation tables (removed in Python 3).
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 high bytes mapped to (HTML entity name, Unicode code
    # point hex) pairs, or to a plain replacement string when no entity
    # exists.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
! 2006:
! 2007: #######################################################################
! 2008:
! 2009:
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # print(x) parses as a parenthesized print statement under Python 2
    # and as the builtin function under Python 3 -- identical output.
    print(soup.prettify())
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>