Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Dynamically changing Xpath
#1
Hey I have been trying to extract data from table from this website:
https://www.macrotrends.net/stocks/chart...ial-ratios

but the problem is that every time the page refreshes, the full Xpath also changes. Besides, I have also tried extracting with Panda and BeautifulSoup, I did fail with the last two approaches as I mostly use Selenium only. Here's the code I have written using modules: Selenium and XLS Writer. While the code works, the data is misplaced everytime I run the code as the website dynamically changes the Xpath.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import xlsxwriter

workbook = xlsxwriter.Workbook('kr_1_BAC.xlsx')
worksheet = workbook.add_worksheet()
bold = workbook.add_format({'bold': True})
money_format = workbook.add_format({'num_format': '$#,##0'})
worksheet.write('A1', 'Particulars', bold)
worksheet.write('B1', '2020', bold)
worksheet.write('C1', '2019', bold)
worksheet.write('D1', '2018', bold)
worksheet.write('E1', '2017', bold)
worksheet.write('F1', '2016', bold)
worksheet.write('G1', '2015', bold)
worksheet.write('H1', '2014', bold)
worksheet.write('I1', '2013', bold)
worksheet.write('J1', '2012', bold)
worksheet.write('K1', '2011', bold)
worksheet.write('L1', '2010', bold)
worksheet.write('M1', '2009', bold)
worksheet.write('N1', '2008', bold)
worksheet.write('O1', '2007', bold)
worksheet.write('P1', '2006', bold)
worksheet.write('Q1', '2005', bold)

options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
#options.add_argument('-headless')
browser = webdriver.Chrome(options=options)
#browser = webdriver.Chrome()
#options.headless = True

url = 'https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios'
browser.get(url)

#Creating Lists:
list_ind=[]
list_yr1=[]
list_yr2=[]
list_yr3=[]
list_yr4=[]
list_yr5=[]
list_yr6=[]
list_yr7=[]
list_yr8=[]
list_yr9=[]
list_yr10=[]
list_yr11=[]
list_yr12=[]
list_yr13=[]
list_yr14=[]
list_yr15=[]
list_yr16=[]

#from 2020-2015:
for n1 in range(1,21):
    ind=browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[1]/div')
    ind=ind.text
    y1 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[8]/div')
    y1 = y1.text
    y2 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[7]/div')
    y2 = y2.text
    y3 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[5]/div')
    y3 = y3.text
    y4 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[6]/div')
    y4 = y4.text
    y5 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[4]/div')
    y5 = y5.text
    y6 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n1}]/div[3]/div')
    y6 = y6.text
    indd = str(ind)
    y11 = str(y1)
    y21 = str(y2)
    y31 = str(y3)
    y41 = str(y4)
    y51 = str(y5)
    y61 = str(y6)
    list_ind.append(indd)
    list_yr1.append(y11)
    list_yr2.append(y21)
    list_yr3.append(y31)
    list_yr4.append(y41)
    list_yr5.append(y51)
    list_yr6.append(y61)
#from 2014-12:
#Clicking on the right button 33 times to scroll the table to the right as the xpath are named the same
m=33
while m>0:
    element_to_hover_over1 = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div/div/div[6]/div/div/div[5]/div')
    hover = ActionChains(browser).move_to_element(element_to_hover_over1).click().perform()
    m=m-1
    print('m',m)
    #hover1 = ActionChains(browser).click
    #time.sleep(1)
    continue
    #Back to extacting data
for n2 in range(1, 21):
    y7 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[9]/div')
    y7 = y7.text
    y8 = browser.find_element_by_xpath(f'//html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[10]/div')
    y8 = y8.text
    y9 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n2}]/div[8]/div')
    y9 = y9.text
    y71 = str(y7)
    y81 = str(y8)
    y91 = str(y9)
    list_yr7.append(y71)
    list_yr8.append(y81)
    list_yr9.append(y91)
time.sleep(3)
#Moving right by clicking on the right button for 35 times
m2=35
while m2>0:
    element_to_hover_over_r = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div/div/div[6]/div/div/div[5]/div')
    hover = ActionChains(browser).move_to_element(element_to_hover_over_r).click().perform()
    m2=m2-1
    print('m',m2)
    #hover1 = ActionChains(browser).click
    #time.sleep(1)
    continue
time.sleep(2)
#2011 only
for n3 in range(1, 21):
    y10 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n3}]/div[9]/div')
    y10 = y10.text
    y101 = str(y10)
    list_yr10.append(y101)
time.sleep(3)
#From 2010-2005/ Scrolling all the way to right
hover5 = ActionChains(browser).click_and_hold(on_element=element_to_hover_over_r).perform()
#time.sleep(1)
for n4 in range(1, 21):
    y11 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[7]/div')
    y11 = y11.text
    y12 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[8]/div')
    y12 = y12.text
    y13 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[9]/div')
    y13 = y13.text
    y14 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[10]/div')
    y14 = y14.text
    y15 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[11]/div')
    y15 = y15.text
    y16 = browser.find_element_by_xpath(f'/html/body/div[2]/div[3]/div[3]/div/div/div[4]/div[2]/div/div[{n4}]/div[12]/div')
    y16 = y16.text
    y111=str(y11)
    y121=str(y12)
    y131=str(y13)
    y141=str(y14)
    y151=str(y15)
    y161=str(y16)
    list_yr11.append(y111)
    list_yr12.append(y121)
    list_yr13.append(y131)
    list_yr14.append(y141)
    list_yr15.append(y151)
    list_yr16.append(y161)
#Counting for XLS writer:
n= len(list_ind)
# Concluding Excel:
row = 1
col = 0
for item1 in (list_ind):
    worksheet.write(row, col, item1)
    row += 1
for item2 in (list_yr1):
    worksheet.write(row-(n*1), col + 1, item2)
    row += 1
for item3 in (list_yr2):
    worksheet.write(row-(n*2), col + 2, item3)
    row += 1
for item4 in (list_yr3):
    worksheet.write(row -(n*3) , col + 3, item4)
    row += 1
for item5 in (list_yr4):
    worksheet.write(row - (n * 4), col + 4, item5)
    row += 1
for item6 in (list_yr5):
    worksheet.write(row - (n * 5), col + 5, item6)
    row += 1
for item7 in (list_yr6):
    worksheet.write(row - (n * 6), col + 6, item7)
    row += 1
for item8 in (list_yr7):
    worksheet.write(row - (n * 7), col + 7, item8)
    row += 1
for item9 in (list_yr8):
    worksheet.write(row - (n * 8), col + 8, item9)
    row += 1
for item10 in (list_yr9):
    worksheet.write(row - (n * 9), col + 9, item10)
    row += 1
for item11 in (list_yr10):
    worksheet.write(row - (n * 10), col + 10, item11)
    row += 1
for item12 in (list_yr11):
    worksheet.write(row - (n * 11), col + 11, item12)
    row += 1
for item13 in (list_yr12):
    worksheet.write(row - (n * 12), col + 12, item13)
    row += 1
for item14 in (list_yr13):
    worksheet.write(row - (n * 13), col + 13, item14)
    row += 1
for item15 in (list_yr14):
    worksheet.write(row - (n * 14), col + 14, item15)
    row += 1
for item16 in (list_yr15):
    worksheet.write(row - (n * 15), col + 15, item16)
    row += 1
for item17 in (list_yr16):
    worksheet.write(row - (n * 16), col + 16, item17)
    row += 1
workbook.close()
browser.quit()

Attached Files

.py   Key Ratios.py (Size: 7.62 KB / Downloads: 267)
Reply
#2
Your code what can i say could be a lot better.
There are many loops,but a many are unnecessary and code still repeating to much in the loops🥨
When code start to get over 200 lines or a lot less,it's about time to think some structure like eg function.

Can giver you some tips and using Pandas is for me an better an easier option than using xlsxwriter.
In Pandas you get similar look that Excel when finish then is just to use df.to_excel().
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from io import StringIO
import pandas as pd

#--| Setup
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)

# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
df.to_excel("output.xlsx", index=False)
Output:
Long-term Debt / Capital 0.4907 0.4763 ... 0.4598 0.4489 0.4803 0 Debt/Equity Ratio 1.9196 1.9389 ... 1.9395 1.781 1.9757 1 Gross Margin - - ... - - - 2 Operating Margin - - ... - - - 3 EBIT Margin - - ... - - - 4 EBITDA Margin - - ... - - - 5 Pre-Tax Profit Margin 25.597 37.3674 ... 34.8895 31.2356 27.8019 6 Net Profit Margin 22.1984 29.6598 ... 19.8471 20.1488 18.078 7 Asset Turnover 0.0263 0.036 ... 0.0367 0.0366 0.0372 8 Inventory Turnover Ratio - - ... - - -
[Image: qXbKs3.png]
Reply
#3
(Jun-17-2021, 12:57 PM)snippsat Wrote: Your code what can i say could be a lot better.
There are many loops,but a many are unnecessary and code still repeating to much in the loops🥨
When code start to get over 200 lines or a lot less,it's about time to think some structure like eg function.

Can giver you some tips and using Pandas is for me an better an easier option than using xlsxwriter.
In Pandas you get similar look that Excel when finish then is just to use df.to_excel().
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from io import StringIO
import pandas as pd

#--| Setup
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)

# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
df.to_excel("output.xlsx", index=False)
Output:
Long-term Debt / Capital 0.4907 0.4763 ... 0.4598 0.4489 0.4803 0 Debt/Equity Ratio 1.9196 1.9389 ... 1.9395 1.781 1.9757 1 Gross Margin - - ... - - - 2 Operating Margin - - ... - - - 3 EBIT Margin - - ... - - - 4 EBITDA Margin - - ... - - - 5 Pre-Tax Profit Margin 25.597 37.3674 ... 34.8895 31.2356 27.8019 6 Net Profit Margin 22.1984 29.6598 ... 19.8471 20.1488 18.078 7 Asset Turnover 0.0263 0.036 ... 0.0367 0.0366 0.0372 8 Inventory Turnover Ratio - - ... - - -
[Image: qXbKs3.png]

Thank you so much for the suggestion. I am still a newbie to Python, so definitely will lean more about functions and Panda now.
However, the main problem still remains for the code you wrote as well. The code does only get data upto 7 columns or upto the year 2015 but I need the data from 2020 to 2005. Could you suggest on how do I extract more data covering the entire table? Anyways thank you so very much for the help.
Reply
#4
(Jun-17-2021, 02:33 PM)AgileAVS Wrote: The code does only get data upto 7 columns or upto the year 2015 but I need the data from 2020 to 2005. Could you suggest on how do I extract more data covering the entire table? Anyways thank you so very much for the help.
The slider most move to activate more data,can show how to move and will probably need a slider loop to get all data.
This is of course a difficult and not so good way to get the data.
Many of theses stock sites has an API(search) then can get data in more normal format as JSON.
A good example of this is ALPHA VANTAGE.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from io import StringIO
import pandas as pd
pd.set_option('expand_frame_repr', False)

#--| Setup
options = Options()
#options.add_argument("--headless")
#options.add_argument("--window-size=1980,1020")
browser = webdriver.Chrome(executable_path=r'C:\cmder\bin\chromedriver.exe', options=options)
#--| Parse or automation
url = "https://www.macrotrends.net/stocks/charts/BAC/bank-of-america/financial-ratios"
browser.get(url)
time.sleep(3)
# Find slider and move it
slider = browser.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
ActionChains(browser).click_and_hold(slider).move_by_offset(350 , 10).release().perform()
time.sleep(3)
# Make a loop for 10 rows
lst = []
for number in range(1,11):
    row = browser.find_elements_by_css_selector(f'#row{number}jqxgrid')
    lst.append(row[0].text.replace('\n', ','))

# Make DataFrame
df = pd.read_csv(StringIO('\n'.join(lst)), sep=",")
print(df)
# Write to excel
#df.to_excel("output.xlsx", index=False)
Output:
Long-term Debt / Capital 0.4598 0.4489 0.4803 0.4997 0.5176 0.5377 0.618 0.6627 0.6545 0.6024 0.5736 0 Debt/Equity Ratio 1.9395 1.781 1.9757 2.2581 2.4808 2.8409 2.9697 3.6177 3.5804 3.8671 4.6823 1 Gross Margin - - - - - - - - - - - 2 Operating Margin - - - - - - - - - - - 3 EBIT Margin - - - - - - - - - - - 4 EBITDA Margin - - - - - - - - - - - 5 Pre-Tax Profit Margin 34.8895 31.2356 27.8019 9.523 18.9399 4.087 -0.2873 -1.6177 6.1345 9.6351 35.7993 6 Net Profit Margin 19.8471 20.1488 18.078 5.3529 11.8076 3.6719 0.1062 -4.4006 -3.101 5.4116 25.1369 7 Asset Turnover 0.0367 0.0366 0.0372 0.0397 0.0406 0.034 0.0376 0.0361 0.0319 0.0253 0.0341 8 Inventory Turnover Ratio - - - - - - - - - - -
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  need help with xpath pythonprogrammer 1 2,752 Jan-18-2020, 11:28 PM
Last Post: snippsat

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020