Python Forum

Full Version: Element Tree Xpath
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I am trying to extract certain data from the XML using Element tree, but I am unsure why my code doesn't work, any guidance would be helpful.

It appears that the paths I have used aren't correct as no values are returned. I think I am fairly close..... hopefully?

import requests
from lxml import etree
     
fromDate = "2018-07-29"
 
def getXML():
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
  <soap12:Body>
    <GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
  </soap12:Body>
</soap12:Envelope>"""
 
    response = requests.post(url,data=body,headers=headers)
    return response.content
 
import pandas as pd
df1 = pd.DataFrame(columns=("applicable_at","name","value","created_date"))   
for pd_date in pd.date_range(fromDate, periods=1):
    day = pd_date.strftime('%Y-%m-%d')
      
    root = etree.fromstring(getXML())
    #map prefix 'd' to the default namespace URI
    ns = {'d': 'http://www.nationalgrid.com/EDP/BusinessEntities/Public'}
      
    publication_objects = root.xpath('//d:EDPObjectCollection', namespaces=ns)
      
    for obj in publication_objects:
        name = obj.find('d:EDPObjectName', ns).text
      
        for data in obj.findall('d:EnergyDataList/d:EDPEnergyDataBE', ns):
            applicable_at = pd.to_datetime(data.find('d:ApplicableAt', ns).text)    
            value = float(data.find('d:FlowRate', ns).text)
            created_date = pd.to_datetime(data.find('d:ScheduleTime', ns).text)
 
         
          
            df1.loc[len(df1) +1] = [applicable_at,name, value,created_date]
Did you check if response.content contains the expected data?
Yep. I printed response.content and it contains the XML that I need.

The problem is to do with the paths I have set element tree to look at. I think they might not Ben right.
The response is returned as bytes. What happens if you convert it to string?
It should work as bytes. It’s something to do with the xml structure and how I’m trying to extract it
The code "doesn't work" how? Getting error, do not get a result?
(Aug-05-2018, 12:25 PM)wavic Wrote: [ -> ]The code "doesn't work" how? Getting error, do not get a result?


Here is a working example of what I want to do. However with the example I am stuck on, the XML returned from the API has a more complicated structure and when I try and get Element tree to pickup certain elements of the tree, it won't do it.

Working Example:
import requests
from lxml import etree
toDate = "2018-04-25"
fromDate = "2018-04-25"
dateType = "gasday"

def getXML():
    url="http://marketinformation.natgrid.co.uk/MIPIws-public/public/publicwebservice.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
        <soap12:Body>
            <GetPublicationDataWM xmlns="http://www.NationalGrid.com/MIPI/">
                <reqObject>
                    <LatestFlag>Y</LatestFlag>
                    <ApplicableForFlag>Y</ApplicableForFlag>
                    <ToDate>%s</ToDate>
                    <FromDate>%s</FromDate>
                    <DateType>%s</DateType>
                    <PublicationObjectNameList>
                    <string>Nominations, Prevailing Nomination, STFergus-NSMP, Sub Terminal</string>
                    <string>Nominations, Prevailing Nomination, Stublach, Storage Entry</string>
                    <string>Nominations, Prevailing Nomination, Stublach, Storage Exit</string>
                    <string>Nominations, Prevailing Nomination, Teesside-BP, Sub Terminal</string>
                    <string>Nominations, Prevailing Nomination, Teesside-PX, Sub Terminal</string>
                    <string>Nominations, Prevailing Nomination, Theddlethorpe, Sub Terminal</string>
                    </PublicationObjectNameList>
                </reqObject>
            </GetPublicationDataWM>
        </soap12:Body>
    </soap12:Envelope>""" % (toDate, fromDate,dateType)

    response = requests.post(url,data=body,headers=headers)
    return response.content
root = etree.fromstring(getXML())
# map prefix 'd' to the default namespace URI
ns = { 'd': 'http://www.NationalGrid.com/MIPI/'}
 

import pandas as pd

df1 = pd.DataFrame(columns=("applicable_at","applicable_for","name","value","quality_indicator","substituted","created_date"))   
for pd_date in pd.date_range(fromDate, periods=1):
    day = pd_date.strftime('%Y-%m-%d')
     
    root = etree.fromstring(getXML())
     
    #map prefix 'd' to the default namespace URI
    ns = {'d': 'http://www.NationalGrid.com/MIPI/'}
     
    publication_objects = root.xpath('//d:CLSMIPIPublicationObjectBE', namespaces=ns)
     
    for obj in publication_objects:
        name = obj.find('d:PublicationObjectName', ns).text
     
        for data in obj.findall('d:PublicationObjectData/d:CLSPublicationObjectDataBE', ns):
            applicable_at = pd.to_datetime(data.find('d:ApplicableAt', ns).text)    
            applicable_for = pd.to_datetime(data.find('d:ApplicableFor', ns).text)
            value = float(data.find('d:Value', ns).text)
            quality_indicator = data.find('d:Value', ns).text
            substituted = data.find('d:Substituted', ns).text
            created_date = pd.to_datetime(data.find('d:CreatedDate', ns).text)
        
         
            df1.loc[len(df1) +1] = [applicable_at, applicable_for,name, value, quality_indicator, substituted, created_date]
My none working example:
import requests
from lxml import etree
      
fromDate = "2018-07-29"
  
def getXML():
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
  <soap12:Body>
    <GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
  </soap12:Body>
</soap12:Envelope>"""
  
    response = requests.post(url,data=body,headers=headers)
    return response.content
  
import pandas as pd
df1 = pd.DataFrame(columns=("applicable_at","name","value","created_date"))   
for pd_date in pd.date_range(fromDate, periods=1):
    day = pd_date.strftime('%Y-%m-%d')
       
    root = etree.fromstring(getXML())
    #map prefix 'd' to the default namespace URI
    ns = {'d': 'http://www.nationalgrid.com/EDP/BusinessEntities/Public'}
       
    publication_objects = root.xpath('//d:EDPObjectCollection', namespaces=ns)
       
    for obj in publication_objects:
        name = obj.find('d:EDPObjectName', ns).text
       
        for data in obj.findall('d:EnergyDataList/d:EDPEnergyDataBE', ns):
            applicable_at = pd.to_datetime(data.find('d:ApplicableAt', ns).text)    
            value = float(data.find('d:FlowRate', ns).text)
            created_date = pd.to_datetime(data.find('d:ScheduleTime', ns).text)
  
          
           
            df1.loc[len(df1) +1] = [applicable_at,name, value,created_date]
Have I set the correct namespace and root path in order to extract data from EDPObjectName?
Well, I managed to get a non-empty publication_objects list.
I don't know how I did it. I am not using lxml and xpath is almost magic for me.
However here it is ( some print functions are added for my convenience ):

import requests
from lxml import etree

fromDate = "2018-07-29"

def getXML():
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
  <soap12:Body>
    <GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
  </soap12:Body>
</soap12:Envelope>"""

    response = requests.post(url,data=body,headers=headers)
    print(response.content)
    return response.content

import pandas as pd
df1 = pd.DataFrame(columns=("applicable_at","name","value","created_date"))
for pd_date in pd.date_range(fromDate, periods=1):
    day = pd_date.strftime('%Y-%m-%d')

    root = etree.fromstring(getXML())
    print(root)
    #map prefix 'd' to the default namespace URI
    ns = {'soap': 'http://www.NationalGrid.com/EDP/BusinessEntities/Public'}

    publication_objects = root.xpath('//soap:EDPObjectCollection', namespaces=ns)
    print('\n\n')
    print(publication_objects)

    for obj in publication_objects:
        name = obj.find('soap:EDPObjectName', ns).text()

        for data in obj.findall('soap:EnergyDataList/d:EDPEnergyDataBE', ns):
            applicable_at = pd.to_datetime(data.find('d:ApplicableAt', ns).text)
            value = float(data.find('soap:FlowRate', ns).text)
            created_date = pd.to_datetime(data.find('soap:ScheduleTime', ns).text)



            df1.loc[len(df1) +1] = [applicable_at,name, value,created_date]
It produces another error but I am too tired. It's an hour after midnight here.
Thanks for your help so far, I haven't had a chance to take a look at the what you posted up as I have been super busy. I will try and take a look this evening.
Does that mean something further up in my code didnt work correctly?

Here is my error:

Error:
Traceback (most recent call last): File "python", line 34, in <module> AttributeError: 'NoneType' object has no attribute 'text'
import requests
from lxml import etree
 
fromDate = "2018-07-29"
 
def getXML():
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
  <soap12:Body>
    <GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
  </soap12:Body>
</soap12:Envelope>"""
 
    response = requests.post(url,data=body,headers=headers)
    print(response.content)
    return response.content
 
import pandas as pd
df1 = pd.DataFrame(columns=("applicable_at","name","value","created_date"))
for pd_date in pd.date_range(fromDate, periods=1):
    day = pd_date.strftime('%Y-%m-%d')
 
    root = etree.fromstring(getXML())
    print(root)
    #map prefix 'd' to the default namespace URI
    ns = {'soap': 'http://www.NationalGrid.com/EDP/BusinessEntities/Public'}
 
    publication_objects = root.xpath('//soap:EDPObjectCollection', namespaces=ns)
    print('\n\n')
    print(publication_objects)
 
    for obj in publication_objects:
        name = obj.find('soap:EDPObjectName', ns).text()
 
        for data in obj.findall('soap:EnergyDataList/d:EDPEnergyDataBE', ns):
            applicable_at = pd.to_datetime(data.find('d:ApplicableAt', ns).text)
            value = float(data.find('soap:FlowRate', ns).text)
            created_date = pd.to_datetime(data.find('soap:ScheduleTime', ns).text)
 
 
 
            df1.loc[len(df1) +1] = [applicable_at,name, value,created_date]
Could it be something to do with when the api response is parsed with etree?