For those who might be interested, I've added a few things to the code: it now displays the sibling index (e.g. the 1 in "3.1"), the element's text value, and the number of identical siblings (elements sharing the same path).
# Print a structural summary of an XML document: one line per distinct
# element path showing depth, a sibling counter, the tag name, the attribute
# names seen on that path, a text value and how many elements share the path.
# NOTE(review): `etree` is presumably lxml.etree (getpath is used below) and
# `xml_file_content` must already hold the XML source -- confirm at call site.
xml_root = etree.fromstring(xml_file_content)
raw_tree = etree.ElementTree(xml_root)

# Matches positional predicates such as "[2]" so that repeated elements
# collapse onto one path key. Compiled once, as a raw string, to avoid the
# invalid "\[" escape in a normal string literal.
index_predicate = re.compile(r'\[[0-9]+\]')

# Maps index-free path -> summary dict, in order of first appearance.
nice_tree = collections.OrderedDict()
for tag in xml_root.iter():
    path = index_predicate.sub('', raw_tree.getpath(tag))
    if path not in nice_tree:
        nice_tree[path] = {'attribs': [], 'values': '', 'count': 1}
    else:
        nice_tree[path]['count'] += 1
    entry = nice_tree[path]
    # De-duplicate against the collected attribute names. Testing membership
    # in the entry dict itself would only look at its three fixed keys and
    # let the same attribute be appended once per repeated element.
    entry['attribs'].extend(
        attrib for attrib in tag.keys() if attrib not in entry['attribs'])
    if tag.text:
        # Collapse runs of whitespace; the most recent element with
        # non-empty text overwrites any earlier value for this path.
        entry['values'] = ' '.join(tag.text.split())

last_level = -1
# A single counter shared by all depths: it restarts at 0 whenever the depth
# differs from the previous line, including when moving back up the tree.
sibling = 0
for path, d in nice_tree.items():
    # Depth is the number of '/' separators in the path, minus one.
    this_level = path.count('/') - 1
    if this_level == last_level:
        sibling += 1
    else:
        sibling = 0
    last_level = this_level
    print('{0}{1}.{2}: {3} [{4}] [{5}] [{6}]'.format(
        ' ' * this_level,
        this_level,
        sibling,
        path.split('/')[-1],
        ', '.join(d['attribs']),
        d['values'],
        d['count']))
The code in my last post calculated the sibling index incorrectly. Here is an updated version that should fix it.
# Print a structural summary of an XML document: one line per distinct
# element path showing depth, a per-depth sibling index, the tag name, the
# attribute names seen on that path, a text value and how many elements
# share the path.
# NOTE(review): `etree` is presumably lxml.etree (getpath is used below) and
# `file_content` must already hold the XML source -- confirm at call site.
xml_root = etree.fromstring(file_content)
raw_tree = etree.ElementTree(xml_root)

# Matches positional predicates such as "[2]" so that repeated elements
# collapse onto one path key. Compiled once, as a raw string, to avoid the
# invalid "\[" escape in a normal string literal.
index_predicate = re.compile(r'\[[0-9]+\]')

# Maps index-free path -> summary dict, in order of first appearance.
nice_tree = collections.OrderedDict()
for tag in xml_root.iter():
    path = index_predicate.sub('', raw_tree.getpath(tag))
    if path not in nice_tree:
        nice_tree[path] = {'attribs': [], 'values': '', 'count': 1}
    else:
        nice_tree[path]['count'] += 1
    entry = nice_tree[path]
    # De-duplicate against the collected attribute names. Testing membership
    # in the entry dict itself would only look at its three fixed keys and
    # let the same attribute be appended once per repeated element.
    entry['attribs'].extend(
        attrib for attrib in tag.keys() if attrib not in entry['attribs'])
    if tag.text:
        # Collapse runs of whitespace; the most recent element with
        # non-empty text overwrites any earlier value for this path.
        entry['values'] = ' '.join(tag.text.split())

last_level = -1
# siblings[depth] is the running sibling index at that depth.
siblings = []
for path, d in nice_tree.items():
    # Depth is the number of '/' separators in the path, minus one.
    this_level = path.count('/') - 1
    if this_level >= len(siblings):
        # First time this depth is reached: start its counter at 0.
        siblings.append(0)
    elif this_level > last_level:
        # Descending into a new parent: numbering restarts at this depth.
        siblings[this_level] = 0
    else:
        # Same depth as the previous line, or back up to a shallower one:
        # continue that depth's numbering. Deeper counters need no reset
        # here because they are reset on the next descent.
        siblings[this_level] += 1
    last_level = this_level
    print('{0}{1}.{2}: {3} [{4}] [{5}] [{6}]'.format(
        ' ' * this_level,
        this_level,
        siblings[this_level],
        path.split('/')[-1],
        ', '.join(d['attribs']),
        d['values'],
        d['count']))