Dynamically changing Xpath

AgileAVS · Jun-17-2021, 10:03 AM

Hey I have been trying to extract data from table from this website:
https://www.macrotrends.net/stocks/chart...ial-ratios

but the problem is that every time the page refreshes, the full Xpath also changes. Besides, I have also tried extracting with Panda and BeautifulSoup, I did fail with the last two approaches as I mostly use Selenium only. Here's the code I have written using modules: Selenium and XLS Writer. While the code works, the data is misplaced everytime I run the code as the website dynamically changes the Xpath.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import xlsxwriter

workbook = xlsxwriter.Workbook('kr_1_BAC.xlsx')
worksheet = workbook.add_worksheet()
bold = workbook.add_format({'bold': True})
money_format = workbook.add_format({'num_format': '$#,##0'})
worksheet.write('A1', 'Particulars', bold)
worksheet.write('B1', '2020', bold)
worksheet.write('C1', '2019', bold)
worksheet.write('D1', '2018', bold)
worksheet.write('E1', '2017', bold)
worksheet.write('F1', '2016', bold)
worksheet.write('G1', '2015', bold)
worksheet.write('H1', '2014', bold)
worksheet.write('I1', '2013', bold)
worksheet.write('J1', '2012', bold)
worksheet.write('K1', '2011', bold)
worksheet.write('L1', '2010', bold)
worksheet.write('M1', '2009', bold)
worksheet.write('N1', '2008', bold)
worksheet.write('O1', '2007', bold)
worksheet.write('P1', '2006', bold)
worksheet.write('Q1', '2005', bold)

options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
#options.add_argument('-headless')
browser = webdriver.Chrome(options=options)
#browser = webdriver.Chrome()
#options.headless = True

url = 'https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios'
browser.get(url)

#Creating Lists:
list_ind=[]
list_yr1=[]
list_yr2=[]
list_yr3=[]
list_yr4=[]
list_yr5=[]
list_yr6=[]
list_yr7=[]
list_yr8=[]
list_yr9=[]
list_yr10=[]
list_yr11=[]
list_yr12=[]
list_yr13=[]
list_yr14=[]
list_yr15=[]
list_yr16=[]

#from 2020-2015:
for n1 in range(1,21):
    ind=browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[1]/div')
    ind=ind.text
    y1 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[8]/div')
    y1 = y1.text
    y2 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[7]/div')
    y2 = y2.text
    y3 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[5]/div')
    y3 = y3.text
    y4 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[6]/div')
    y4 = y4.text
    y5 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[4]/div')
    y5 = y5.text
    y6 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[3]/div')
    y6 = y6.text
    indd = str(ind)
    y11 = str(y1)
    y21 = str(y2)
    y31 = str(y3)
    y41 = str(y4)
    y51 = str(y5)
    y61 = str(y6)
    list_ind.append(indd)
    list_yr1.append(y11)
    list_yr2.append(y21)
    list_yr3.append(y31)
    list_yr4.append(y41)
    list_yr5.append(y51)
    list_yr6.append(y61)
#from 2014-12:
#Clicking on the right button 33 times to scroll the table to the right as the xpath are named the same
m=33
while m>0:
    element_to_hover_over1 = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div/div/div[6]/div/div/div[5]/div')
    hover = ActionChains(browser).move_to_element(element_to_hover_over1).click().perform()
    m=m-1
    print('m',m)
    #hover1 = ActionChains(browser).click
    #time.sleep(1)
    continue
    #Back to extacting data
for n2 in range(1, 21):
    y7 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[9]/div')
    y7 = y7.text
    y8 = browser.find_element_by_xpath(f'//html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[10]/div')
    y8 = y8.text
    y9 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[8]/div')
    y9 = y9.text
    y71 = str(y7)
    y81 = str(y8)
    y91 = str(y9)
    list_yr7.append(y71)
    list_yr8.append(y81)
    list_yr9.append(y91)
time.sleep(3)
#Moving right by clicking on the right button for 35 times
m2=35
while m2>0:
    element_to_hover_over_r = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div/div/div[6]/div/div/div[5]/div')
    hover = ActionChains(browser).move_to_element(element_to_hover_over_r).click().perform()
    m2=m2-1
    print('m',m2)
    #hover1 = ActionChains(browser).click
    #time.sleep(1)
    continue
time.sleep(2)
#2011 only
for n3 in range(1, 21):
    y10 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n3}]/div[9]/div')
    y10 = y10.text
    y101 = str(y10)
    list_yr10.append(y101)
time.sleep(3)
#From 2010-2005/ Scrolling all the way to right
hover5 = ActionChains(browser).click_and_hold(on_element=element_to_hover_over_r).perform()
#time.sleep(1)
for n4 in range(1, 21):
    y11 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[7]/div')
    y11 = y11.text
    y12 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[8]/div')
    y12 = y12.text
    y13 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[9]/div')
    y13 = y13.text
    y14 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[10]/div')
    y14 = y14.text
    y15 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[11]/div')
    y15 = y15.text
    y16 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[12]/div')
    y16 = y16.text
    y111=str(y11)
    y121=str(y12)
    y131=str(y13)
    y141=str(y14)
    y151=str(y15)
    y161=str(y16)
    list_yr11.append(y111)
    list_yr12.append(y121)
    list_yr13.append(y131)
    list_yr14.append(y141)
    list_yr15.append(y151)
    list_yr16.append(y161)
#Counting for XLS writer:
n= len(list_ind)
# Concluding Excel:
row = 1
col = 0
for item1 in (list_ind):
    worksheet.write(row, col, item1)
    row += 1
for item2 in (list_yr1):
    worksheet.write(row-(n*1), col + 1, item2)
    row += 1
for item3 in (list_yr2):
    worksheet.write(row-(n*2), col + 2, item3)
    row += 1
for item4 in (list_yr3):
    worksheet.write(row -(n*3) , col + 3, item4)
    row += 1
for item5 in (list_yr4):
    worksheet.write(row - (n * 4), col + 4, item5)
    row += 1
for item6 in (list_yr5):
    worksheet.write(row - (n * 5), col + 5, item6)
    row += 1
for item7 in (list_yr6):
    worksheet.write(row - (n * 6), col + 6, item7)
    row += 1
for item8 in (list_yr7):
    worksheet.write(row - (n * 7), col + 7, item8)
    row += 1
for item9 in (list_yr8):
    worksheet.write(row - (n * 8), col + 8, item9)
    row += 1
for item10 in (list_yr9):
    worksheet.write(row - (n * 9), col + 9, item10)
    row += 1
for item11 in (list_yr10):
    worksheet.write(row - (n * 10), col + 10, item11)
    row += 1
for item12 in (list_yr11):
    worksheet.write(row - (n * 11), col + 11, item12)
    row += 1
for item13 in (list_yr12):
    worksheet.write(row - (n * 12), col + 12, item13)
    row += 1
for item14 in (list_yr13):
    worksheet.write(row - (n * 13), col + 13, item14)
    row += 1
for item15 in (list_yr14):
    worksheet.write(row - (n * 14), col + 14, item15)
    row += 1
for item16 in (list_yr15):
    worksheet.write(row - (n * 15), col + 15, item16)
    row += 1
for item17 in (list_yr16):
    worksheet.write(row - (n * 16), col + 16, item17)
    row += 1
workbook.close()
browser.quit()

***snippsat*** · Jun-17-2021, 12:57 PM

Your code what can i say could be a lot better.
There are many loops,but a many are unnecessary and code still repeating to much in the loops🥨
When code start to get over 200 lines or a lot less,it's about time to think some structure like eg function.

Can giver you some tips and using Pandas is for me an better an easier option than using xlsxwriter.
In Pandas you get similar look that Excel when finish then is just to use df.to_excel().

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from io import StringIO
import pandas as pd

#--| Setup
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)

# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
df.to_excel("output.xlsx", index=False)

Output:   Long-term Debt / Capital   0.4907   0.4763  ...   0.4598   0.4489   0.4803
0         Debt/Equity Ratio   1.9196   1.9389  ...   1.9395    1.781   1.9757
1              Gross Margin        -        -  ...        -        -        -
2          Operating Margin        -        -  ...        -        -        -
3               EBIT Margin        -        -  ...        -        -        -
4             EBITDA Margin        -        -  ...        -        -        -
5     Pre-Tax Profit Margin   25.597  37.3674  ...  34.8895  31.2356  27.8019
6         Net Profit Margin  22.1984  29.6598  ...  19.8471  20.1488   18.078
7            Asset Turnover   0.0263    0.036  ...   0.0367   0.0366   0.0372
8  Inventory Turnover Ratio        -        -  ...        -        -        -

AgileAVS · Jun-17-2021, 02:33 PM

(Jun-17-2021, 12:57 PM)snippsat Wrote: Your code what can i say could be a lot better.
There are many loops,but a many are unnecessary and code still repeating to much in the loops🥨
When code start to get over 200 lines or a lot less,it's about time to think some structure like eg function.

Can giver you some tips and using Pandas is for me an better an easier option than using xlsxwriter.
In Pandas you get similar look that Excel when finish then is just to use df.to_excel().

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from io import StringIO
import pandas as pd

#--| Setup
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)

# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
df.to_excel("output.xlsx", index=False)

Output:   Long-term Debt / Capital   0.4907   0.4763  ...   0.4598   0.4489   0.4803
0         Debt/Equity Ratio   1.9196   1.9389  ...   1.9395    1.781   1.9757
1              Gross Margin        -        -  ...        -        -        -
2          Operating Margin        -        -  ...        -        -        -
3               EBIT Margin        -        -  ...        -        -        -
4             EBITDA Margin        -        -  ...        -        -        -
5     Pre-Tax Profit Margin   25.597  37.3674  ...  34.8895  31.2356  27.8019
6         Net Profit Margin  22.1984  29.6598  ...  19.8471  20.1488   18.078
7            Asset Turnover   0.0263    0.036  ...   0.0367   0.0366   0.0372
8  Inventory Turnover Ratio        -        -  ...        -        -        -

Thank you so much for the suggestion. I am still a newbie to Python, so definitely will lean more about functions and Panda now.
However, the main problem still remains for the code you wrote as well. The code does only get data upto 7 columns or upto the year 2015 but I need the data from 2020 to 2005. Could you suggest on how do I extract more data covering the entire table? Anyways thank you so very much for the help.

***snippsat*** · Jun-17-2021, 04:27 PM

(Jun-17-2021, 02:33 PM)AgileAVS Wrote: The code does only get data upto 7 columns or upto the year 2015 but I need the data from 2020 to 2005. Could you suggest on how do I extract more data covering the entire table? Anyways thank you so very much for the help.

The slider most move to activate more data,can show how to move and will probably need a slider loop to get all data.
This is of course a difficult and not so good way to get the data.
Many of theses stock sites has an API(search) then can get data in more normal format as JSON.
A good example of this is ALPHA VANTAGE.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from io import StringIO
import pandas as pd
pd.set_option('expand_frame_repr', False)

#--| Setup
options = Options()
#options.add_argument("--headless")
#options.add_argument("--window-size=1980,1020")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)
# Find slider and move it
slider = browser.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
ActionChains(browser).click_and_hold(slider).move_by_offset(350 , 10).release().perform()
time.sleep(3)
# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
#df.to_excel("output.xlsx", index=False)

Output:   Long-term Debt / Capital   0.4598   0.4489   0.4803  0.4997   0.5176  0.5377    0.618   0.6627  0.6545  0.6024   0.5736
0         Debt/Equity Ratio   1.9395    1.781   1.9757  2.2581   2.4808  2.8409   2.9697   3.6177  3.5804  3.8671   4.6823
1              Gross Margin        -        -        -       -        -       -        -        -       -       -        -
2          Operating Margin        -        -        -       -        -       -        -        -       -       -        -
3               EBIT Margin        -        -        -       -        -       -        -        -       -       -        -
4             EBITDA Margin        -        -        -       -        -       -        -        -       -       -        -
5     Pre-Tax Profit Margin  34.8895  31.2356  27.8019   9.523  18.9399   4.087  -0.2873  -1.6177  6.1345  9.6351  35.7993
6         Net Profit Margin  19.8471  20.1488   18.078  5.3529  11.8076  3.6719   0.1062  -4.4006  -3.101  5.4116  25.1369
7            Asset Turnover   0.0367   0.0366   0.0372  0.0397   0.0406   0.034   0.0376   0.0361  0.0319  0.0253   0.0341
8  Inventory Turnover Ratio        -        -        -       -        -       -        -        -       -       -        -

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	need help with xpath	pythonprogrammer	1	2,752	Jan-18-2020, 11:28 PM Last Post: snippsat

Dynamically changing Xpath

User Panel Messages

Announcements