Sep-11-2022, 06:17 PM
I have a Python crawler that downloads Baidu Index data for the search keywords I provide. It works for most Chinese characters. For some keywords, however, Python raises the "invalid literal for int() with base 10: ''" error. I have checked several times and made sure there is no empty space, but the error remains.
The code:
It would be really appreciated if someone could help! Thanks!
The code:
import requests import json from datetime import date, timedelta import pandas as pd class DownloadBaiDuIndex(object): def __init__(self, cookie): self.cookie = cookie self.headers = { "Connection": "keep-alive", "Accept": "application/json, text/plain, */*", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://index.baidu.com/v2/main/index.html", "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6,ja;q=0.5,ru;q=0.4", 'Cookie': self.cookie, "Host": "index.baidu.com", "X-Requested-With": "XMLHttpRequest", "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==", } def decrypt(self, ptbk, index_data): n = len(ptbk) // 2 a = dict(zip(ptbk[:n], ptbk[n:])) return "".join([a[s] for s in index_data]) def get_index_data_json(self, keys, start=None, end=None): words = [[{"name":key, "wordType":1}] for key in keys] words = str(words).replace(" ", "").replace("'", "\"") url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}' print(words, start, end) res = requests.get(url, headers=self.headers) data = res.json()['data'] uniqid = data['uniqid'] url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}' res = requests.get(url, headers=self.headers) ptbk = res.json()['data'] result = {} result["startDate"] = start result["endDate"] = end for userIndexe in data['userIndexes']: name = userIndexe['word'][0]['name'] tmp = {} index_all = 
userIndexe['all']['data'] index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")] tmp["all"] = index_all_data index_pc = userIndexe['pc']['data'] index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")] tmp["pc"] = index_pc_data index_wise = userIndexe['wise']['data'] index_wise_data = [int(e) for e in self.decrypt(ptbk, index_wise).split(",")] tmp["wise"] = index_wise_data result[name] = tmp return result def GetIndex(self, keys, start=None, end=None): today = date.today() if start is None: start = str(today - timedelta(days=8)) if end is None: end = str(today - timedelta(days=2)) try: raw_data = self.get_index_data_json(keys=keys, start=start, end=end) raw_data = pd.DataFrame(raw_data[keys[0]]) raw_data.index = pd.date_range(start=start, end=end) except Exception as e: print(e) raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []}) finally: return raw_data # Baidu Index Crawler appended_data = [] names = ["万科", "宝钢股份", "宁波华翔", "邯郸钢铁"] cookie = 'PSTM=1493135429; BIDUPSID=4E6249E17CE020DD96051F17E859065E; MCITY=-:; __yjs_duid=1_f2bdcf71928819b290246dfb5ec1f88b1627719752767; BAIDUID=EDE52BAB587B1289E4CFE6876999D70E:FG=1; BDUSS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV; ai_user=TU6P0S+hXPuepmudkzF3ab|2022-09-07T06:20:41.482Z; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1662531642; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc={"uid_":{"value":"2737153346","scope":1}}; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=ah840l8lal818k052kakma061hhppa518; ZFY=haOdTA3raQNehkj7kdTtwgp2mbl7MGaNDvXqpzATSJ0:C; BAIDUID_BFESS=EDE52BAB587B1289E4CFE6876999D70E:FG=1; delPer=0; PSINO=7; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_PSSID=36544_37117_37361_36885_37274_36806_36786_37244_37260_26350_37232; bdindexid=rn8uincmt59ojbfomaj3ijas86; 
SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04129204022rfXD6ExmZPpUJhdFgUNKL4Qf58vyWFT2S9WLV9yYMYQuaAJmU3S1E39ned8eaf1lkOvzmrGB1ag+Jlrwf6nqg7TMyOdUIrHPpXAZtXo0rHsdOrEqtHM/bZHpdWofRX9/yLcabgvvrvlMoQqIlWgvw+ONVPggUTw1e2w+bt/EwSzYDQ1Yg467d2OHikWxK4pp63uhKDengOWFOCsY+vk7ptdZXsAvh2eijgfMNHfhdQq/zVIH9NFTRhT+pgZIC93eFJsp67DzV7YvSjinm0rZhA==60986356220794518584054979528195; __cas__rn__=412920402; __cas__st__212=10292876e9f34179535097291f4887e79777d53e349fc22b264d7913546801e104d3983195722c1cbba597de; __cas__id__212=42237265; CPID_212=42237265; CPTK_212=428319642; ab_sr=1.0.1_YTBlMzg0ZGQ2N2Y2NDUwZGM0OTUyOGZiYTRiYzg2NTE5ZTY5OWVkZjExY2Q2YTExZmJkMjFjMTZiMjVhN2E2ODkwYzE4MjJiZDU4ZGZiOGU3YWYwYmM3NmVkOWVkOWIyNGVmZWQ5YTFkNjM2NzU5OWJmMGJiOTZkZjAwNTVkMzUyNmMyNWEyMDIxZmYwZThiMmEwZGMwODgzODE3OTRiYQ==; RT="z=1&dm=baidu.com&si=c706899d-6871-43a3-8e6d-2c363b5d090b&ss=l7wbr8mm&sl=0&tt=0&bcn=https://fclog.baidu.com/log/weirwood?type=perf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1662907967; BDUSS_BFESS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV' downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie) for name in names: data = downloadbaiduindex.GetIndex(keys=[name], start='2020-01-01', end='2020-12-31') data['Ticker'] = name appended_data.append(data) appended_data = pd.concat(appended_data) appended_data.to_csv('Baidu_Index.csv')For example, in the name list, the first two "万科", "宝钢股份" work but the last two "宁波华翔", "邯郸钢铁" don't.
It would be really appreciated if someone could help! Thanks!