ó
f#ñ]c           @   sµ   d  Z  d d l Z d d l Z d d l Z d d l m Z e j e ƒ Z	 i  Z
 d „  Z e ƒ  d e f d „  ƒ  YZ e j d ƒ Z d „  Z d	 „  Z d
 „  Z d „  Z d „  Z d S(   sÕ  Support for regular expressions conformant to the XML Schema specification.

For the most part, XML regular expressions are similar to the POSIX
ones, and can be handled by the Python C{re} module.  The exceptions
are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or
C{\p{IPAExtensions}}) and the character set subtraction capability.
This module supports those by scanning the regular expression,
replacing the category escapes with equivalent charset expressions.
It further detects the subtraction syntax and modifies the charset
expression to remove the unwanted code points.

The basic technique is to step through the characters of the regular
expression, entering a recursive-descent parser when one of the
translated constructs is encountered.

There is a nice set of XML regular expressions at
U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
with a sample document at U{
http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}iÿÿÿÿN(   t   sixc          C   sy  t  j i t j j j t j d ƒ 6ƒ t j d ƒ }  x= t j	 t j j j
 ƒ D]# \ } } | t  |  t j | ƒ <qN Wx= t j	 t j j j ƒ D]# \ } } | t  |  t j | ƒ <qŽ Wx= t j	 t j j j ƒ D]# \ } } | t  |  t j | ƒ <qÎ Wx= t j	 t j j j ƒ D]# \ } } | t  |  t j | ƒ <qWx= t j	 t j j j ƒ D]# \ } } | t  |  t j | ƒ <qNWd S(   sP   Set the values in _AllEsc without introducing C{k} and C{v} into
    the module.t   .i\   N(   t   _AllEsct   updatet   pyxbt   utilst   unicodet   WildcardEscR    t   ut   unichrt	   iteritemst   SingleCharEsct	   text_typet   MultiCharEsct   catEsct   complEsct
   IsBlockEsc(   t   bst   kt   v(    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   _InitializeAllEsc0   s    &%%%%%t   RegularExpressionErrorc           B   s   e  Z d  Z d „  Z RS(   s6   Raised when a regular expression cannot be processed..c         C   s'   | |  _  t j |  d | | f ƒ d  S(   Ns	   At %d: %s(   t   positiont
   ValueErrort   __init__(   t   selfR   t   description(    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyR   D   s    	(   t   __name__t
   __module__t   __doc__R   (    (    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyR   B   s   sF   \\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))c         C   sì   t  j |  | ƒ } | rÎ | j d ƒ } t j | ƒ } | d	 k	 rR | | j ƒ  f S| j d ƒ } | d	 k	 rµ | j d ƒ rœ t | d | d | f ƒ ‚ n  t | d | f ƒ ‚ n  t | d | f ƒ ‚ n  t | d |  | f ƒ ‚ d	 S(
   sO  Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    This is one of:

      - U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>},
      an escaped single character such as C{E{\}n}

      - U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>},
      an escape code that can match a range of characters,
      e.g. C{E{\}s} to match certain whitespace characters

      - U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the
      C{E{\}pE{lb}...E{rb}} Unicode property escapes including
      categories and blocks

      - U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>},
      the C{E{\}PE{lb}...E{rb}} inverted Unicode property escapes

    If the parsing fails, throws a RegularExpressionError.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the character class, and C{p} is the text offset
    immediately following the escape sequence.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    i    t   charPropt   Iss#   Unrecognized Unicode block %s in %si   s"   Unrecognized character property %ss   Unrecognized character class %ss$   Unrecognized escape identifier at %sN(	   t   _CharClassEsc_ret   matcht   groupR   t   gett   Nonet   endt
   startswithR   (   t   textR   t   mot   escape_codet   cpst	   char_prop(    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   _MatchCharClassEscI   s     c         C   s}  | } d d d „  ƒ  Y} | ƒ  } g  } t  } xUt r…| t |  ƒ k r[ t | d ƒ ‚ n  |  | } | t j d ƒ k rÁ | s‘ | d | k	 r£ t | d ƒ ‚ n  t } | j ƒ  | d } Pq1 | t j d ƒ k rÚ Pq1 | t j d	 ƒ k r<t |  | ƒ \ } } | j	 ƒ  }	 |	 d k	 r,| j |	 ƒ q‚| j | ƒ q1 | t j d
 ƒ k rk| j | ƒ | d } q1 | j | ƒ | d } q1 W| sžt | d ƒ ‚ n  | d | k rÄt j d
 ƒ | d <n  | d | k rêt j d
 ƒ | d <n  t j j j ƒ  }
 d } xk| t | ƒ k  ro| | } | d t | ƒ k  r| | d | k r| | d } t | t j ƒ st | t j ƒ rÅ| | k s—| | k r©t | d ƒ ‚ n  t | d | | f ƒ ‚ n  | | k rãt | d ƒ ‚ n  |
 j t | ƒ t | ƒ f ƒ | d } q| | k r-t | d ƒ ‚ n5 t | t j ƒ rU|
 j t | ƒ ƒ n |
 j | ƒ | d } qW|
 | | f S(   sZ  Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the next character is the C{-} in a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset immediately following the closing brace.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    t	   DashClassc           B   s   e  Z RS(    (   R   R   (    (    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyR-   …   s   s:   Incomplete character class expression, missing closing ']'t   [iÿÿÿÿs,   '[' character not allowed in character classi   t   ]i\   t   -s!   Empty character class not allowedi    i   sF   Two dashes in a row is not allowed in the middle of a character class.sK   Dashes must be surrounded by characters, not character class escapes. %r %rs5   Character ranges must have the lowest character firsti   s!   Dash without an initial character(    N(   t   Falset   Truet   lenR   R    R   t   popR	   R,   t   asSingleCharacterR$   t   appendR   R   R   t   CodePointSett
   isinstanceR   t   addt   ordt   extend(   R'   R   t   start_positionR-   t   DASHt   tokenst   has_following_subtractiont   chR*   t   single_chart
   result_cpst	   cur_tokent   startR%   (    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   _MatchPosCharGroupu   sp    		



*&c         C   sš  | t  |  ƒ k r$ t | d ƒ ‚ n  t j d ƒ |  | k rZ t | d |  | f ƒ ‚ n  | d } | t  |  ƒ k rˆ t | d ƒ ‚ n  |  | d k } | r« | d } n  t |  | ƒ \ } } } | rØ | j ƒ  } n  | rO|  | t j d ƒ k sý t ‚ |  | d t j d ƒ k s t ‚ | d } t |  | ƒ \ } } | j | ƒ n  | t  |  ƒ k sz|  | t j d ƒ k rŒt | d ƒ ‚ n  | | d f S(	   sÌ  Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]}, C{[^abc]}, or C{[a-z-[q]]}.

    @param text: The complete text of the regular expression being
    translated.  The first character must be the C{[} starting a
    character class.

    @param position: The offset of the start of the character group.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the property, and C{p} is the text offset
    immediately following the closing brace.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    s"   Missing character class expressionR.   s6   Expected start of character class expression, got '%s'i   t   ^R0   R/   s#   Expected ']' to end character class(	   R3   R   R    R   RE   t   negatet   AssertionErrort   _MatchCharClassExprt   subtract(   R'   R   t   negatedRB   R?   t   sub_cps(    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyRI   Î   s,    
#
+c         C   s   | t  |  ƒ k r d S|  | } | d } d | k rI t j j j | f Sd | k rb t |  | ƒ Sd | k r{ t |  | ƒ Sd S(   s>  Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} does not begin a character class
    expression; otherwise a pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points associated with
    the property, and C{p} is the text offset immediately following
    the closing brace.i   R   R.   s   \N(   R3   R$   R   R   R   R   RI   R,   (   R'   R   t   ct   np(    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   MaybeMatchCharacterClassü   s    

c         C   s	  t  |  t j ƒ s t ‚ g  } | j d ƒ d } x» | t |  ƒ k  rî t |  | ƒ } | d	 k rÌ |  | } | t j d ƒ k s• | t j d ƒ k r² | j t j	 d ƒ | ƒ n | j | ƒ | d 7} q4 | \ } } | j | j
 ƒ  ƒ q4 W| j d ƒ d j | ƒ S(
   sm  Convert the given pattern to the format required for Python
    regular expressions.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.s   ^(i    RF   t   $i\   i   s   )$t    N(   R8   R    R   RH   R6   R3   RO   R$   R   R	   t	   asPatternt   join(   t   patternt   new_pattern_eltsR   t   cgR@   R*   (    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   XMLToPython  s     

*(   R   t   ret   loggingt   pyxb.utils.unicodeR   t
   pyxb.utilsR    t	   getLoggerR   t   _logR   R   R   R   t   compileR    R,   RE   RI   RO   RW   (    (    (    sE   /data/av2000/b2b/venv/lib/python2.7/site-packages/pyxb/utils/xmlre.pyt   <module>#   s   		,	Y	.	