Skip to content

Commit ba07885

Browse files
committed
simplify interface for passing multiple tag selectors into element iteration methods
1 parent 00def23 commit ba07885

File tree

4 files changed

+138
-67
lines changed

4 files changed

+138
-67
lines changed

doc/api.txt

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ children. Using the tree defined above, we get:
196196
>>> [ child.tag for child in root ]
197197
['a', 'b', 'c', 'd']
198198

199-
To iterate in the opposite direction, use the ``reversed()`` function
199+
To iterate in the opposite direction, use the builtin ``reversed()`` function
200200
that exists in Python 2.4 and later.
201201

202202
Tree traversal should use the ``element.iter()`` method:
@@ -229,25 +229,33 @@ Note how ``element.iterdescendants()`` does not include the element
229229
itself, as opposed to ``element.iter()``. The latter effectively
230230
implements the 'descendant-or-self' axis in XPath.
231231

232-
All of these iterators support an additional ``tag`` keyword argument that
233-
filters the generated elements by tag name:
232+
All of these iterators support one (or more, since lxml 2.4) additional
233+
arguments that filter the generated elements by tag name:
234234

235235
.. sourcecode:: pycon
236236

237-
>>> [ child.tag for child in root.iterchildren(tag='a') ]
237+
>>> [ child.tag for child in root.iterchildren('a') ]
238238
['a']
239-
>>> [ child.tag for child in d.iterchildren(tag='a') ]
239+
>>> [ child.tag for child in d.iterchildren('a') ]
240240
[]
241-
>>> [ el.tag for el in root.iterdescendants(tag='d') ]
241+
>>> [ el.tag for el in root.iterdescendants('d') ]
242242
['d']
243-
>>> [ el.tag for el in root.iter(tag='d') ]
243+
>>> [ el.tag for el in root.iter('d') ]
244244
['d']
245+
>>> [ el.tag for el in root.iter('d', 'a') ]
246+
['a', 'd']
247+
248+
Note that the order of the elements is determined by the iteration order,
249+
which is the document order in most cases (except for preceding siblings
250+
and ancestors, where it is the reversed document order). The order of
251+
the tag selection arguments is irrelevant, as you can see in the last
252+
example.
245253

246254
The most common way to traverse an XML tree is depth-first, which
247255
traverses the tree in document order. This is implemented by the
248256
``.iter()`` method. While there is no dedicated method for
249257
breadth-first traversal, it is almost as simple if you use the
250-
``collections.deque`` type from Python 2.4.
258+
``collections.deque`` type that is available in Python 2.4 and later.
251259

252260
.. sourcecode:: pycon
253261

doc/tutorial.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,8 @@ serialised the tree to XML:
479479
another - Child 3
480480

481481
If you know you are only interested in a single tag, you can pass its name to
482-
``iter()`` to have it filter for you:
482+
``iter()`` to have it filter for you. Since lxml 2.4, you can also pass more
483+
than one tag to match any of several tags during iteration.
483484

484485
.. sourcecode:: pycon
485486

@@ -488,6 +489,12 @@ If you know you are only interested in a single tag, you can pass its name to
488489
child - Child 1
489490
child - Child 2
490491

492+
>>> for element in root.iter("another", "child"):
493+
... print("%s - %s" % (element.tag, element.text))
494+
child - Child 1
495+
child - Child 2
496+
another - Child 3
497+
491498
By default, iteration yields all nodes in the tree, including
492499
ProcessingInstructions, Comments and Entity instances. If you want to
493500
make sure only Element objects are returned, you can pass the

src/lxml/lxml.etree.pyx

Lines changed: 83 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,52 +1277,61 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
12771277
return None
12781278
return _elementFactory(self._doc, c_node)
12791279

1280-
def itersiblings(self, tag=None, *, preceding=False):
1281-
u"""itersiblings(self, tag=None, preceding=False)
1280+
def itersiblings(self, tag=None, *tags, preceding=False):
1281+
u"""itersiblings(self, tag=None, *tags, preceding=False)
12821282
12831283
Iterate over the following or preceding siblings of this element.
12841284
12851285
The direction is determined by the 'preceding' keyword which
12861286
defaults to False, i.e. forward iteration over the following
12871287
siblings. When True, the iterator yields the preceding
12881288
siblings in reverse document order, i.e. starting right before
1289-
the current element and going left. The generated elements
1290-
can be restricted to a specific tag name with the 'tag'
1291-
keyword.
1289+
the current element and going backwards.
1290+
1291+
The returned elements can be restricted to a specific tag name by
1292+
passing a tag or a series of tag names.
12921293
"""
1293-
return SiblingsIterator(self, tag, preceding=preceding)
1294+
if tag is not None:
1295+
tags += (tag,)
1296+
return SiblingsIterator(self, tags, preceding=preceding)
12941297

1295-
def iterancestors(self, tag=None):
1296-
u"""iterancestors(self, tag=None)
1298+
def iterancestors(self, tag=None, *tags):
1299+
u"""iterancestors(self, tag=None, *tags)
12971300
12981301
Iterate over the ancestors of this element (from parent to parent).
12991302
1300-
The generated elements can be restricted to a specific tag name with
1301-
the 'tag' keyword.
1303+
The returned elements can be restricted to a specific tag name by
1304+
passing a tag or a series of tag names.
13021305
"""
1303-
return AncestorsIterator(self, tag)
1306+
if tag is not None:
1307+
tags += (tag,)
1308+
return AncestorsIterator(self, tags)
13041309

1305-
def iterdescendants(self, tag=None):
1306-
u"""iterdescendants(self, tag=None)
1310+
def iterdescendants(self, tag=None, *tags):
1311+
u"""iterdescendants(self, tag=None, *tags)
13071312
13081313
Iterate over the descendants of this element in document order.
13091314
13101315
As opposed to ``el.iter()``, this iterator does not yield the element
1311-
itself. The generated elements can be restricted to a specific tag
1312-
name with the 'tag' keyword.
1316+
itself. The returned elements can be restricted to a specific tag
1317+
name by passing a tag or a series of tag names.
13131318
"""
1314-
return ElementDepthFirstIterator(self, tag, inclusive=False)
1319+
if tag is not None:
1320+
tags += (tag,)
1321+
return ElementDepthFirstIterator(self, tags, inclusive=False)
13151322

1316-
def iterchildren(self, tag=None, *, reversed=False):
1317-
u"""iterchildren(self, tag=None, reversed=False)
1323+
def iterchildren(self, tag=None, *tags, reversed=False):
1324+
u"""iterchildren(self, tag=None, *tags, reversed=False)
13181325
13191326
Iterate over the children of this element.
13201327
1321-
As opposed to using normal iteration on this element, the generated
1322-
elements can be restricted to a specific tag name with the 'tag'
1323-
keyword and reversed with the 'reversed' keyword.
1328+
As opposed to using normal iteration on this element, the returned
1329+
elements can be restricted to a specific tag name by passing a tag
1330+
or a series of tag names, and reversed with the 'reversed' keyword.
13241331
"""
1325-
return ElementChildIterator(self, tag, reversed=reversed)
1332+
if tag is not None:
1333+
tags += (tag,)
1334+
return ElementChildIterator(self, tags, reversed=reversed)
13261335

13271336
def getroottree(self):
13281337
u"""getroottree(self)
@@ -1336,15 +1345,17 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
13361345
_assertValidDoc(self._doc)
13371346
return _elementTreeFactory(self._doc, None)
13381347

1339-
def getiterator(self, tag=None):
1340-
u"""getiterator(self, tag=None)
1348+
def getiterator(self, tag=None, *tags):
1349+
u"""getiterator(self, tag=None, *tags)
13411350
13421351
Returns a sequence or iterator of all elements in the subtree in
13431352
document order (depth first pre-order), starting with this
13441353
element.
13451354
13461355
Can be restricted to find only elements with a specific tag
1347-
(pass ``tag="xyz"``) or from a namespace (pass ``tag="{ns}*"``).
1356+
(pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
1357+
Passing a sequence of tags will let the iterator return all
1358+
elements matching any of these tags, in document order.
13481359
13491360
You can also pass the Element, Comment, ProcessingInstruction and
13501361
Entity factory functions to look only for the specific element type.
@@ -1357,34 +1368,43 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
13571368
method in new code if you require backwards compatibility
13581369
with older versions of lxml or ElementTree.
13591370
"""
1360-
return ElementDepthFirstIterator(self, tag)
1371+
if tag is not None:
1372+
tags += (tag,)
1373+
return ElementDepthFirstIterator(self, tags)
13611374

1362-
def iter(self, tag=None):
1363-
u"""iter(self, tag=None)
1375+
def iter(self, tag=None, *tags):
1376+
u"""iter(self, tag=None, *tags)
13641377
13651378
Iterate over all elements in the subtree in document order (depth
13661379
first pre-order), starting with this element.
13671380
13681381
Can be restricted to find only elements with a specific tag
1369-
(pass ``tag="xyz"``) or from a namespace (pass ``tag="{ns}*"``).
1382+
(pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
1383+
Passing a sequence of tags will let the iterator return all
1384+
elements matching any of these tags, in document order.
13701385
13711386
You can also pass the Element, Comment, ProcessingInstruction and
13721387
Entity factory functions to look only for the specific element type.
13731388
"""
1374-
return ElementDepthFirstIterator(self, tag)
1389+
if tag is not None:
1390+
tags += (tag,)
1391+
return ElementDepthFirstIterator(self, tags)
13751392

1376-
def itertext(self, tag=None, *, with_tail=True):
1377-
u"""itertext(self, tag=None, with_tail=True)
1393+
def itertext(self, tag=None, *tags, with_tail=True):
1394+
u"""itertext(self, tag=None, *tags, with_tail=True)
13781395
13791396
Iterates over the text content of a subtree.
13801397
1381-
You can pass the ``tag`` keyword argument to restrict text content to
1382-
a specific tag name.
1398+
You can pass a tag name to restrict text content to a specific tag
1399+
name. Passing a sequence of tags will let the iterator consider
1400+
all elements matching any of these tags.
13831401
13841402
You can set the ``with_tail`` keyword argument to ``False`` to skip
13851403
over tail text.
13861404
"""
1387-
return ElementTextIterator(self, tag, with_tail=with_tail)
1405+
if tag is not None:
1406+
tags += (tag,)
1407+
return ElementTextIterator(self, tags, with_tail=with_tail)
13881408

13891409
def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
13901410
u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
@@ -1915,15 +1935,16 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
19151935
tree.xmlFree(c_path)
19161936
return path
19171937

1918-
def getiterator(self, tag=None):
1919-
u"""getiterator(self, tag=None)
1938+
def getiterator(self, tag=None, *tags):
1939+
u"""getiterator(self, tag=None, *tags)
19201940
19211941
Returns a sequence or iterator of all elements in document order
19221942
(depth first pre-order), starting with the root element.
19231943
19241944
Can be restricted to find only elements with a specific tag
1925-
(pass ``tag="xyz"`` or ``tag="{ns}xyz"``) or from a namespace
1926-
(pass ``tag="{ns}*"``).
1945+
(pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
1946+
Passing a sequence of tags will let the iterator return all
1947+
elements matching any of these tags, in document order.
19271948
19281949
You can also pass the Element, Comment, ProcessingInstruction and
19291950
Entity factory functions to look only for the specific element type.
@@ -1939,18 +1960,22 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
19391960
root = self.getroot()
19401961
if root is None:
19411962
return ()
1942-
return root.getiterator(tag)
1963+
if tag is not None:
1964+
tags += (tag,)
1965+
return root.getiterator(*tags)
19431966

1944-
def iter(self, tag=None):
1945-
u"""iter(self, tag=None)
1967+
def iter(self, tag=None, *tags):
1968+
u"""iter(self, tag=None, *tags)
19461969
19471970
Creates an iterator for the root element. The iterator loops over
19481971
all elements in this tree, in document order.
19491972
"""
19501973
root = self.getroot()
19511974
if root is None:
19521975
return ()
1953-
return root.iter(tag)
1976+
if tag is not None:
1977+
tags += (tag,)
1978+
return root.iter(*tags)
19541979

19551980
def find(self, path, namespaces=None):
19561981
u"""find(self, path, namespaces=None)
@@ -2424,24 +2449,24 @@ cdef class _MultiTagMatcher:
24242449
self._tag_count = 0
24252450
if self._cached_tags:
24262451
for i in xrange(count):
2427-
python.Py_XDECREF(self._cached_tags[i].href)
2452+
cpython.ref.Py_XDECREF(self._cached_tags[i].href)
24282453
cpython.mem.PyMem_Free(self._cached_tags)
24292454
self._cached_tags = NULL
24302455

24312456
cdef initTagMatch(self, tags):
24322457
self._cached_doc = None
24332458
del self._py_tags[:]
24342459
self._clear()
2435-
if tags is None:
2436-
# match anything
2460+
if tags is None or tags == ():
2461+
# no selection in tags argument => match anything
24372462
self._node_types = (
24382463
1 << tree.XML_COMMENT_NODE |
24392464
1 << tree.XML_PI_NODE |
24402465
1 << tree.XML_ENTITY_REF_NODE |
24412466
1 << tree.XML_ELEMENT_NODE)
2442-
return
2443-
self._node_types = 0
2444-
self._storeTags(tags, set())
2467+
else:
2468+
self._node_types = 0
2469+
self._storeTags(tags, set())
24452470

24462471
cdef _storeTags(self, tag, set seen):
24472472
if tag is Comment:
@@ -2456,10 +2481,13 @@ cdef class _MultiTagMatcher:
24562481
if tag in seen:
24572482
return
24582483
seen.add(tag)
2459-
href, name = _getNsTag(tag)
2460-
if name == b'*':
2461-
name = None
2462-
self._py_tags.append((href, name))
2484+
if tag in ('*', '{*}*'):
2485+
self._node_types |= 1 << tree.XML_ELEMENT_NODE
2486+
else:
2487+
href, name = _getNsTag(tag)
2488+
if name == b'*':
2489+
name = None
2490+
self._py_tags.append((href, name))
24632491
else:
24642492
# support a sequence of tags
24652493
for item in tag:
@@ -2476,6 +2504,7 @@ cdef class _MultiTagMatcher:
24762504
self._tag_count = 0
24772505
if not self._py_tags:
24782506
self._cached_doc = doc
2507+
self._cached_size = dict_size
24792508
return 0
24802509
if not self._cached_tags:
24812510
self._cached_tags = <qname*>cpython.mem.PyMem_Malloc(len(self._py_tags) * sizeof(qname))
@@ -2599,7 +2628,7 @@ cdef class ElementDepthFirstIterator:
25992628
instructions. To filter them out, check if the ``tag`` property
26002629
of the returned element is a string (i.e. not None and not a
26012630
factory function), or pass the ``Element`` factory for the ``tag``
2602-
keyword.
2631+
argument to receive only Elements.
26032632
26042633
If the optional ``tag`` argument is not None, the iterator returns only
26052634
the elements that match the respective name and namespace.

src/lxml/tests/test_etree.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2195,19 +2195,46 @@ def test_getiterator_filter_multiple(self):
21952195

21962196
self.assertEquals(
21972197
[a, b],
2198-
list(a.getiterator( ('a', 'b') )))
2198+
list(a.getiterator('a', 'b')))
2199+
self.assertEquals(
2200+
[],
2201+
list(a.getiterator('x', 'y')))
2202+
self.assertEquals(
2203+
[a, f],
2204+
list(a.getiterator('f', 'a')))
2205+
self.assertEquals(
2206+
[c, e, f],
2207+
list(c.getiterator('c', '*', 'a')))
2208+
self.assertEquals(
2209+
[],
2210+
list(a.getiterator( (), () )))
2211+
2212+
def test_getiterator_filter_multiple_tuple(self):
2213+
Element = self.etree.Element
2214+
SubElement = self.etree.SubElement
2215+
2216+
a = Element('a')
2217+
b = SubElement(a, 'b')
2218+
c = SubElement(a, 'c')
2219+
d = SubElement(b, 'd')
2220+
e = SubElement(c, 'e')
2221+
f = SubElement(c, 'f')
2222+
2223+
self.assertEquals(
2224+
[a, b],
2225+
list(a.getiterator( ('a', 'b') )))
21992226
self.assertEquals(
22002227
[],
22012228
list(a.getiterator( ('x', 'y') )))
22022229
self.assertEquals(
22032230
[a, f],
2204-
list(a.getiterator( ('f', 'a') )))
2231+
list(a.getiterator( ('f', 'a') )))
22052232
self.assertEquals(
22062233
[c, e, f],
2207-
list(c.getiterator( ('c', '*', 'a') )))
2234+
list(c.getiterator( ('c', '*', 'a') )))
22082235
self.assertEquals(
22092236
[],
2210-
list(a.getiterator( () )))
2237+
list(a.getiterator( () )))
22112238

22122239
def test_getiterator_filter_namespace(self):
22132240
Element = self.etree.Element

0 commit comments

Comments
 (0)