May-01-2020, 04:45 PM
I want to scrap payment details from EPFO site but unable to apply post request for same. Any help will be appreciated.
import os from bs4 import BeautifulSoup import cv2 import json from imutils import paths import requests from captcha_breaker import captcha_breaker from urllib.request import urlretrieve def establishment_data_from_epfo(): get_headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'} url = "https://unifiedportal-epfo.epfindia.gov.in/publicPortal/no-auth/misReport/home/loadEstSearchHome" estNames=['Abb India Limited', 'Aegis Logistics Limited', 'Amara Raja Batteries Limited', 'Ambalal Sarabhai Enterprises Limited', 'Housing Development Finance Corporation Limited' ] for estName in estNames: try: # get request for main urls r = requests.get(url,headers=get_headers,stream=True) if r.status_code == 200: soup = BeautifulSoup(r.content, 'html.parser') try: post_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Content-Type': 'application/json; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', 'Origin': 'https://unifiedportal-epfo.epfindia.gov.in', 'Connection': 'keep-alive', 'Referer': 'https://unifiedportal-epfo.epfindia.gov.in/publicPortal/no-auth/misReport/home/loadEstSearchHome', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', } # to find captcha image src src = "https://unifiedportal-epfo.epfindia.gov.in"+soup.find("img", {"id": "capImg"}).get('src').split('?')[0]+'.png' print("Capcha src:",src) urlretrieve(src, "Captcha/{}".format("captcha.png")) image_file = list(paths.list_images("Captcha")) # to display captcha image """ image=cv2.imread("Captcha/captcha.png") cv2.imshow("Captcha",image) cv2.waitKey(0) """ captcha_text = captcha_breaker(image_file[0]) print("Captcha text is",captcha_text) # to find post request url form = soup.find("form", {"id": "employerSearchForm"}) est_search = form.find("input", {"id": "searchEmployer"}).get("onclick") url_href = est_search.split("'")[1] if not url_href.startswith('https'): action_url = "https://unifiedportal-epfo.epfindia.gov.in" + str(url_href) print("Post URL is",action_url) # post request data data={ "EstName":"{0}".format(estName), "EstCode":"", "captcha":"{0}".format(captcha_text), } # post request for actual data # this is question that i want to ask res = requests.post(action_url,headers=post_headers,data=data) if res.status_code == 200: soup_data = BeautifulSoup(res.content, 'html.parser') try: table=soup_data.find("table",{"id":"example"}) print(table) # it should return table but returning None( it return error page) except Exception as e: print(e) elif res.status_code == 404: print('data is not found due to 404 error') else: print('data is not found due to other status code') except Exception as e: print('data is not found.',e) elif r.status_code == 404: print('data is not found due to 404 error') else: print('data is not found due to other status code') except requests.ConnectionError as ce: print("There is a network problem (DNS Failure, refused connection etc.). Error : ",ce) establishment_data_from_epfo()