ó
n€
\c           @   sÑ  d  Z  d d l Z d d l Z d d l m Z d d l m Z d d l m	 Z	 d d l
 m Z m Z m Z y
 e Z Wn e k
 r“ e e f Z n Xy d d l m Z Wn! e k
 rË d d l m Z n Xy d d l m Z Wn! e k
 rd d l m Z n Xd	 e f d
 „  ƒ  YZ y d d l m Z Wn e k
 rAn  Xd e f d „  ƒ  YZ e ƒ  Z d „  Z d d d „ Z e d d d „ Z  e d d d „ Z! d d d „ Z" d d d „ Z# d „  Z$ e ƒ  Z% d S(   s?   
An interface to html5lib that mimics the lxml.html interface.
iÿÿÿÿN(   t
   HTMLParser(   t   TreeBuilder(   t   etree(   t   Elementt   XHTML_NAMESPACEt   _contains_block_level_tag(   t   urlopen(   t   urlparseR    c           B   s   e  Z d  Z e d „ Z RS(   s*   An html5lib HTML parser with lxml as tree.c         K   s    t  j |  d | d t | d  S(   Nt   strictt   tree(   t   _HTMLParsert   __init__R   (   t   selfR   t   kwargs(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyR      s    (   t   __name__t
   __module__t   __doc__t   FalseR   (    (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyR       s   (   t   XHTMLParserR   c           B   s   e  Z d  Z e d „ Z RS(   s+   An html5lib XHTML Parser with lxml as tree.c         K   s    t  j |  d | d t | d  S(   NR   R	   (   t   _XHTMLParserR   R   (   R   R   R   (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyR   *   s    (   R   R   R   R   R   (    (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyR   '   s   c         C   s6   |  j  | ƒ } | d  k	 r | S|  j  d t | f ƒ S(   Ns   {%s}%s(   t   findt   NoneR   (   R	   t   tagt   elem(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt	   _find_tag0   s    c         C   sŒ   t  |  t ƒ s t d ƒ ‚ n  | d k r3 t } n  i  } | d k r] t  |  t ƒ r] t } n  | d k	 rv | | d <n  | j |  |  j ƒ  S(   sÍ   
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    s   string requiredt
   useChardetN(	   t
   isinstancet   _stringst	   TypeErrorR   t   html_parsert   bytest   Truet   parset   getroot(   t   htmlt   guess_charsett   parsert   options(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt   document_fromstring7   s    		c         C   så   t  |  t ƒ s t d ƒ ‚ n  | d k r3 t } n  i  } | d k r] t  |  t ƒ r] t } n  | d k	 rv | | d <n  | j |  d |  } | rá t  | d t ƒ rá | rá | d j ƒ  rÔ t	 j
 d | d ƒ ‚ n  | d =qá n  | S(   s`  Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    s   string requiredR   t   divi    s   There is leading text: %rN(   R   R   R   R   R   R   R   t   parseFragmentt   stripR   t   ParserError(   R"   t   no_leading_textR#   R$   R%   t   children(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt   fragments_fromstringO   s"    			c         C   s;  t  |  t ƒ s t d ƒ ‚ n  t | ƒ } t |  d | d | d | ƒ} | r· t  | t ƒ sg d } n  t | ƒ } | r³ t  | d t ƒ r£ | d | _ | d =n  | j | ƒ n  | S| sÏ t j	 d ƒ ‚ n  t
 | ƒ d k ró t j	 d	 ƒ ‚ n  | d } | j r.| j j ƒ  r.t j	 d
 | j ƒ ‚ n  d | _ | S(   sÂ  Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    s   string requiredR#   R$   R+   R'   i    s   No elements foundi   s   Multiple elements founds   Element followed by text: %rN(   R   R   R   t   boolR-   R   t   textt   extendR   R*   t   lent   tailR)   R   (   R"   t   create_parentR#   R$   t   accept_leading_textt   elementst   new_roott   result(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt   fragment_fromstringq   s2    
	

	c         C   sA  t  |  t ƒ s t d ƒ ‚ n  t |  d | d | ƒ} |  d  } t  | t ƒ rd | j d d ƒ } n  | j ƒ  j ƒ  } | j d ƒ s” | j d ƒ r˜ | St	 | d	 ƒ } t
 | ƒ r· | St	 | d
 ƒ } t
 | ƒ d k r| j sò | j j ƒ  r| d j s| d j j ƒ  r| d St | ƒ r4d | _ n	 d | _ | S(   s   Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    s   string requiredR$   R#   i2   t   asciit   replaces   <htmls	   <!doctypet   headt   bodyi   iÿÿÿÿi    R'   t   span(   R   R   R   R&   R   t   decodet   lstript   lowert
   startswithR   R1   R/   R)   R2   R   R   (   R"   R#   R$   t   doct   startR;   R<   (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt
   fromstring   s*    	
,"	c         C   s¿   | d k r t } n  t |  t ƒ sB |  } | d k r– t } q– nT t |  ƒ rr t |  ƒ } | d k r– t } q– n$ t |  d ƒ } | d k r– t } n  i  } | r¯ | | d <n  | j	 | |  S(   s*  Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    t   rbR   N(
   R   R   R   R   R   t   _looks_like_urlR   R   t   openR    (   t   filename_url_or_fileR#   R$   t   fpR%   (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyR    Ó   s"    		c         C   sV   t  |  ƒ d } | s t St j d k rN | t j k rN t | ƒ d k rN t St Sd  S(   Ni    t   win32i   (   R   R   t   syst   platformt   stringt   ascii_lettersR1   R   (   t   strt   scheme(    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyRF   ÷   s    (&   R   RK   RM   t   html5libR    R
   t    html5lib.treebuilders.etree_lxmlR   t   lxmlR   t	   lxml.htmlR   R   R   t
   basestringR   t	   NameErrorR   RO   t   urllib2R   t   ImportErrort   urllib.requestR   t   urllib.parseR   R   t   xhtml_parserR   R   R&   R   R-   R8   RD   R    RF   R   (    (    (    sJ   /data/av2000/b2b/venv/lib/python2.7/site-packages/lxml/html/html5parser.pyt   <module>   sF   
		!+6$	