Package pyxmpp :: Module xmlextra
[hide private]

Source Code for Module pyxmpp.xmlextra

  1  # 
  2  # (C) Copyright 2003-2010 Jacek Konieczny <jajcus@jajcus.net> 
  3  # 
  4  # This program is free software; you can redistribute it and/or modify 
  5  # it under the terms of the GNU Lesser General Public License Version 
  6  # 2.1 as published by the Free Software Foundation. 
  7  # 
  8  # This program is distributed in the hope that it will be useful, 
  9  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 10  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 11  # GNU Lesser General Public License for more details. 
 12  # 
 13  # You should have received a copy of the GNU Lesser General Public 
 14  # License along with this program; if not, write to the Free Software 
 15  # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 16  # 
 17  # pylint: disable-msg=C0103, W0132, W0611 
 18   
 19  """Extension to libxml2 for XMPP stream and stanza processing""" 
 20   
 21  __revision__="$Id: xmlextra.py,v 1.15 2004/10/11 18:33:51 jajcus Exp $" 
 22  __docformat__="restructuredtext en" 
 23   
 24  import sys 
 25  import libxml2 
 26  import threading 
 27  import re 
 28   
 29  from pyxmpp.exceptions import StreamParseError 
 30   
 31  common_doc = libxml2.newDoc("1.0") 
 32  common_root = common_doc.newChild(None,"root",None) 
 33  COMMON_NS = "http://pyxmpp.jajcus.net/xmlns/common" 
 34  common_ns = common_root.newNs(COMMON_NS, None) 
 35  common_root.setNs(common_ns) 
 36  common_doc.setRootElement(common_root) 
 37   
class StreamHandler:
    """Base class for stream handler.

    Subclasses override `stream_start`, `stream_end`, `stanza`, `error`
    and `warning`.  The underscore-prefixed methods accept raw libxml2
    objects, wrap them in `libxml2.xmlDoc` / `libxml2.xmlNode` and forward
    to the public hooks (presumably they are the entry points used by the
    C-extension reader -- confirm against `_xmlextra`).
    """

    def __init__(self):
        pass

    def _stream_start(self, _doc):
        """Process stream start.

        :Parameters:
            - `_doc`: raw document object, wrapped in `libxml2.xmlDoc`
              before being passed to `stream_start`."""
        self.stream_start(libxml2.xmlDoc(_doc))

    def _stream_end(self, _doc):
        """Process stream end.

        :Parameters:
            - `_doc`: raw document object, wrapped in `libxml2.xmlDoc`
              before being passed to `stream_end`."""
        self.stream_end(libxml2.xmlDoc(_doc))

    def _stanza(self, _doc, _node):
        """Process complete stanza.

        :Parameters:
            - `_doc`: raw document object, wrapped in `libxml2.xmlDoc`.
            - `_node`: raw node object, wrapped in `libxml2.xmlNode`."""
        doc = libxml2.xmlDoc(_doc)
        node = libxml2.xmlNode(_node)
        self.stanza(doc, node)

    def stream_start(self, doc):
        """Called when the start tag of root element is encountered
        in the stream.

        The default implementation only reports the event on standard error.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        # write() + repr() emit the same bytes as the old
        # "print >>sys.stderr, ..., `x`" statement, but parse on Python 3 too.
        sys.stderr.write("Unhandled stream start: %s\n"
                         % (repr(doc.serialize()),))

    def stream_end(self, doc):
        """Called when the end tag of root element is encountered
        in the stream.

        The default implementation only reports the event on standard error.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        sys.stderr.write("Unhandled stream end %s\n"
                         % (repr(doc.serialize()),))

    def stanza(self, _unused, node):
        """Called when the end tag of a direct child of the root
        element is encountered in the stream.

        Please note, that node will be removed from the document
        and freed after this method returns. If it is needed after
        that a copy must be made before the method returns.

        The default implementation only reports the event on standard error.

        :Parameters:
            - `_unused`: the document being parsed.
            - `node`: the (complete) element being processed
        :Types:
            - `_unused`: `libxml2.xmlDoc`
            - `node`: `libxml2.xmlNode`"""
        sys.stderr.write("Unhandled stanza %s\n" % (repr(node.serialize()),))

    def error(self, descr):
        """Called when an error is encountered in the stream.

        :Parameters:
            - `descr`: description of the error
        :Types:
            - `descr`: `str`

        :raise StreamParseError: always, carrying `descr`."""
        raise StreamParseError(descr)

    def warning(self, desc):
        """Called when a warning is encountered in the stream.

        The warning is printed on standard output, except for the known
        'vcard-temp is not absolute' namespace warning, which is dropped.

        :Parameters:
            - `desc`: description of the warning
        :Types:
            - `desc`: `str`"""
        # we know vcard-temp is bad...
        if not desc.startswith('xmlns: URI vcard-temp is not absolute'):
            sys.stdout.write("XML STREAM WARNING: %s\n" % (desc,))
114 115 try: 116 ######################################################################### 117 # C-extension based workarounds for libxml2 limitations 118 #------------------------------------------------------- 119 from pyxmpp import _xmlextra 120 from pyxmpp._xmlextra import error 121 122 _create_reader = _xmlextra.sax_reader_new 123
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (C-extension backed).

    The old namespace declaration will be removed if present on the `node`.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""
    # The extension module works on the raw objects kept in the `_o`
    # attribute of the libxml2 wrappers; None is passed through unchanged.
    raw_old = None
    if old_ns is not None:
        raw_old = old_ns._o

    raw_new = None
    if new_ns is not None:
        raw_new = new_ns._o

    raw_node = None
    if node is not None:
        raw_node = node._o

    _xmlextra.replace_ns(raw_node, raw_old, raw_new)
    if raw_old:
        # Only an actual old namespace can have a declaration to drop.
        _xmlextra.remove_ns(raw_node, raw_old)
154 155 pure_python = False 156 157 except ImportError: 158 ######################################################################### 159 # Pure python implementation (slow workarounds for libxml2 limitations) 160 #-----------------------------------------------------------------------
class error(Exception):
    """Exception raised on a stream parse error.

    Pure-Python stand-in for the `error` name that is otherwise imported
    from `pyxmpp._xmlextra`."""
164
def _escape(data):
    """Escape data for XML.

    Replaces the five characters ``& < > ' "`` with their predefined
    entity references and returns the resulting string."""
    # "&" has to go first, otherwise the ampersands introduced by the
    # other replacements would be escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
    for character, entity in substitutions:
        data = data.replace(character, entity)
    return data
173
class _SAXCallback(libxml2.SAXCallback):
    """SAX events handler for the python-only stream parser.

    The handler tracks the element nesting depth in `_level`.  The root
    element's start tag is kept in `_head` (and its end tag in `_tail`),
    and the text of each direct child of the root (a stanza) is
    re-serialized into `_current`.  When a stanza is complete it is parsed
    again, wrapped in the root tags, and handed to the stream handler.
    """

    def __init__(self, handler):
        """Initialize the SAX handler.

        :Parameters:
            - `handler`: Object to handle stream start, end and stanzas.
        :Types:
            - `handler`: `StreamHandler`
        """
        self._handler = handler
        self._head = ""      # serialized start tag of the root element
        self._tail = ""      # matching end tag of the root element
        self._current = ""   # serialized text of the stanza being collected
        self._level = 0      # current element nesting depth
        self._doc = None     # document built from the root element alone
        self._root = None    # root element of `_doc`

    def cdataBlock(self, data):
        """Append escaped CDATA content found inside a stanza."""
        if self._level > 1:
            self._current += _escape(data)

    def characters(self, data):
        """Append escaped character data found inside a stanza."""
        if self._level > 1:
            self._current += _escape(data)

    def comment(self, content):
        """Ignore comments."""
        pass

    def endDocument(self):
        """Nothing to do at the end of the document."""
        pass

    def endElement(self, tag):
        """Handle an end tag.

        Closing a direct child of the root delivers the collected stanza to
        `handler.stanza()`; closing the root itself calls
        `handler.stream_end()` and frees the stream document."""
        self._current += "</%s>" % (tag,)
        self._level -= 1
        if self._level > 1:
            # still inside a stanza -- keep collecting
            return
        if self._level == 1:
            # A complete stanza: parse it wrapped in the root tags so that
            # namespace declarations made on the root are in effect.
            xml = self._head + self._current + self._tail
            doc = libxml2.parseDoc(xml)
            try:
                node = doc.getRootElement().children
                try:
                    node1 = node.docCopyNode(self._doc, 1)
                    try:
                        self._root.addChild(node1)
                        self._handler.stanza(self._doc, node1)
                    finally:
                        # Always detach and free the copy, as promised by
                        # StreamHandler.stanza().  The previous bare
                        # "except:" did this only on failure and silently
                        # swallowed the exception, so every delivered
                        # stanza stayed attached to the stream document.
                        node1.unlinkNode()
                        node1.freeNode()
                        del node1
                finally:
                    del node
            finally:
                doc.freeDoc()
        else:
            # End of the root element.
            # NOTE(review): the re-parsed head+tail document is not used
            # for anything but is kept so parse errors surface as before.
            xml = self._head + self._tail
            doc = libxml2.parseDoc(xml)
            try:
                self._handler.stream_end(self._doc)
                self._doc.freeDoc()
                self._doc = None
                self._root = None
            finally:
                doc.freeDoc()

    def error(self, msg):
        """Forward a parser error to the stream handler."""
        self._handler.error(msg)

    fatalError = error

    ignorableWhitespace = characters

    def reference(self, name):
        """Append an entity reference (``&name;``) to the collected text."""
        self._current += "&" + name + ";"

    def startDocument(self):
        """Nothing to do at the start of the document."""
        pass

    def startElement(self, tag, attrs):
        """Handle a start tag.

        The root start tag creates the stream document and triggers
        `handler.stream_start()`; the start tag of a direct child of the
        root begins a new stanza; deeper start tags are appended to the
        stanza being collected."""
        parts = ["<" + tag]
        if attrs:
            for a, v in attrs.items():
                parts.append(" %s='%s'" % (a, _escape(v)))
        parts.append(">")
        s = "".join(parts)
        if self._level == 0:
            self._head = s
            self._tail = "</%s>" % (tag,)
            xml = self._head + self._tail
            self._doc = libxml2.parseDoc(xml)
            self._handler.stream_start(self._doc)
            self._root = self._doc.getRootElement()
        elif self._level == 1:
            self._current = s
        else:
            self._current += s
        self._level += 1

    def warning(self, msg):
        """Forward a parser warning to the stream handler.

        The previous signature took no message argument, unlike `error`,
        so any call passing the warning text failed with a TypeError."""
        self._handler.warning(msg)
284
class _PythonReader:
    """Python-only stream reader."""

    def __init__(self, handler):
        """Initialize the reader.

        :Parameters:
            - `handler`: Object to handle stream start, end and stanzas.
        :Types:
            - `handler`: `StreamHandler`
        """
        self.handler = handler
        self.sax = _SAXCallback(handler)
        self.parser = libxml2.createPushParser(self.sax, '', 0, 'stream')

    def doc(self):
        """Get the raw object of the document being parsed.

        `StreamReader.doc()` calls this method on its reader and wraps a
        true result in `libxml2.xmlDoc`, so the raw `_o` object of the SAX
        handler's document is returned rather than the wrapper itself.

        :return: the raw document object or `None` when no stream document
            exists (before the root start tag or after the root end tag)."""
        document = self.sax._doc
        if document is None:
            return None
        return document._o

    def feed(self, data):
        """Feed the parser with a chunk of data. Appropriate methods
        of `self.handler` will be called whenever something interesting is
        found.

        :Parameters:
            - `data`: the chunk of data to parse.
        :Types:
            - `data`: `str`

        :return: whatever the push parser's `parseChunk` returns."""
        return self.parser.parseChunk(data, len(data), 0)

_create_reader = _PythonReader
def _get_ns(node):
    """Get namespace of node.

    :return: the namespace object or `None` if the node has no namespace
        assigned (i.e. `node.ns()` raised `libxml2.treeError`).
    :returntype: `libxml2.xmlNs`"""
    try:
        namespace = node.ns()
    except libxml2.treeError:
        namespace = None
    return namespace
322
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (pure-Python implementation).

    The namespace of `node`, of its attributes and, recursively, of its
    child elements is switched from `old_ns` to `new_ns`.  Child elements
    which themselves declare a namespace with the same prefix as `old_ns`
    are left alone together with their subtrees.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""
    if old_ns is None:
        old_uri = None
        old_prefix = None
    else:
        old_uri = old_ns.content
        old_prefix = old_ns.name

    def bound_to_old_ns(item):
        """Tell whether `item` currently uses the namespace being replaced."""
        current = _get_ns(item)
        if current is None:
            return old_ns is None
        return (current and current.content == old_uri
                and current.name == old_prefix)

    def redeclares_old_prefix(element):
        """Tell whether `element` declares a namespace under the old prefix."""
        try:
            declaration = element.nsDefs()
        except libxml2.treeError:
            declaration = None
        while declaration:
            if declaration.name == old_prefix:
                return True
            declaration = declaration.next
        return False

    # the node itself
    if bound_to_old_ns(node):
        node.setNs(new_ns)

    # its attributes
    attribute = node.properties
    while attribute:
        if bound_to_old_ns(attribute):
            attribute.setNs(new_ns)
        attribute = attribute.next

    # and the child elements, unless they shadow the old prefix
    child = node.children
    while child:
        if child.type == 'element' and not redeclares_old_prefix(child):
            replace_ns(child, old_ns, new_ns)
        child = child.next
375 376 pure_python = True 377 378 ########################################################### 379 # Common code 380 #------------- 381
def get_node_ns(xmlnode):
    """Namespace of an XML node.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: namespace of the node or `None` (when `xmlnode.ns()` raises
        `libxml2.treeError`)
    :returntype: `libxml2.xmlNs`"""
    try:
        namespace = xmlnode.ns()
    except libxml2.treeError:
        namespace = None
    return namespace
396
def get_node_ns_uri(xmlnode):
    """Return namespace URI of an XML node.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: namespace URI of the node (decoded from UTF-8) or `None`
        when the node has no namespace.
    :returntype: `unicode`"""
    namespace = get_node_ns(xmlnode)
    if not namespace:
        return None
    return unicode(namespace.getContent(), "utf-8")
412
def xml_node_iter(nodelist):
    """Iterate over sibling XML nodes. All types of nodes will be returned
    (not only the elements).

    Usually used to iterate over node's children like this::

        xml_node_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while True:
        if not current:
            return
        yield current
        # `next` is read only after the consumer resumes the generator
        current = current.next
430
def xml_element_iter(nodelist):
    """Iterate over sibling XML elements. Non-element nodes will be skipped.

    Usually used to iterate over node's children like this::

        xml_element_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while current:
        is_element = (current.type == "element")
        if is_element:
            yield current
        # `next` is read only after the consumer resumes the generator
        current = current.next
448
def xml_element_ns_iter(nodelist, ns_uri):
    """Iterate over sibling XML elements. Only elements in the given
    namespace will be returned.

    Usually used to iterate over node's children like this::

        xml_element_ns_iter(node.children, ns_uri)

    :Parameters:
        - `nodelist`: start node of the list.
        - `ns_uri`: namespace URI the returned elements must have, compared
          with the result of `get_node_ns_uri`.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while current:
        # the namespace is looked up for element nodes only
        if current.type == "element":
            if get_node_ns_uri(current) == ns_uri:
                yield current
        current = current.next
# Control characters below U+0020 other than TAB, LF and CR.
evil_characters_re = re.compile(r"[\000-\010\013\014\016-\037]", re.UNICODE)
# U+FFFD REPLACEMENT CHARACTER as an UTF-8 encoded byte string.
utf8_replacement_char = u"\ufffd".encode("utf-8")


def remove_evil_characters(s):
    """Replace control characters (not allowed in XML) in a string.

    Every character matched by `evil_characters_re` is substituted with
    U+FFFD: as a unicode character for `unicode` input, as its UTF-8 byte
    sequence for any other input.

    :return: the sanitized string."""
    if isinstance(s, unicode):
        replacement = u"\ufffd"
    else:
        replacement = utf8_replacement_char
    return evil_characters_re.sub(replacement, s)
# Matches the text up to and including the whitespace before the first
# default namespace declaration (group 1) and the declaration itself (group 2)
# inside the first start tag of a serialized element.
bad_nsdef_replace_re = re.compile(r"^([^<]*\<[^><]*\s+)(xmlns=((\"[^\"]*\")|(\'[^\']*\')))")


def safe_serialize(xmlnode):
    """Serialize an XML element making sure the result is sane.

    Remove invalid namespace declarations from the result string and
    pass it through `remove_evil_characters`.

    :Parameters:
        - `xmlnode`: the XML element to serialize.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: UTF-8 encoded serialized and sanitized element.
    :returntype: `string`"""
    try:
        own_ns = xmlnode.ns()
    except libxml2.treeError:
        own_ns = None
    try:
        declaration = xmlnode.nsDefs()
    except libxml2.treeError:
        declaration = None

    serialized = xmlnode.serialize(encoding="UTF-8")

    # Look for the first default (prefix-less) namespace declaration on the
    # element; when it is not the element's own namespace, cut the first
    # "xmlns=..." out of the serialized start tag.
    while declaration:
        if declaration.name is None:
            if (not own_ns or (declaration.name, declaration.content)
                    != (own_ns.name, own_ns.content)):
                serialized = bad_nsdef_replace_re.sub("\\1", serialized, 1)
                break
        declaration = declaration.next

    return remove_evil_characters(serialized)
508
class StreamReader:
    """A simple push-parser interface for XML streams."""

    def __init__(self, handler):
        """Initialize `StreamReader` object.

        :Parameters:
            - `handler`: handler object for the stream content
        :Types:
            - `handler`: `StreamHandler` derived class
        """
        self.reader = _create_reader(handler)
        self.lock = threading.RLock()
        # non-zero while a feed() call is in progress
        self.in_use = 0

    def doc(self):
        """Get the document being parsed.

        :return: the document or `None` when the underlying reader
            reports none.
        :returntype: `libxml2.xmlDoc`"""
        ret = self.reader.doc()
        if ret:
            return libxml2.xmlDoc(ret)
        else:
            return None

    def feed(self, s):
        """Pass a string to the stream parser.

        :Parameters:
            - `s`: string to parse.
        :Types:
            - `s`: `str`

        :return: the result of the underlying reader's `feed()`; the
            original documentation describes it as `None` on EOF, `False`
            when whole input was parsed and `True` if there is something
            still left in the buffer.

        :raise StreamParseError: when called while another `feed()` call on
            the same object is still in progress."""
        self.lock.acquire()
        try:
            # The lock is re-entrant, so a nested call from the same thread
            # gets here too; the `in_use` flag is what rejects it.
            if self.in_use:
                raise StreamParseError("StreamReader.feed() is not reentrant!")
            self.in_use = 1
            try:
                return self.reader.feed(s)
            finally:
                self.in_use = 0
        finally:
            self.lock.release()
552 553 554 # vi: sts=4 et sw=4 555