Python crawler reports errors for some Chinese characters
I have a Python crawler that downloads Baidu index data for the search keywords I provide. It works for most Chinese keywords, but for some of them Python raises the error "invalid literal for int() with base 10: ''". I have checked several times and made sure there is no empty space in the keywords, but the error remains.
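That exact message is what int() produces when it is given an empty string, which is easy to confirm in a REPL, so it looks like the decrypted data contains an empty field somewhere rather than the keyword itself being wrong:

>>> int('')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: invalid literal for int() with base 10: ''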

The code:
import requests
import json
from datetime import date, timedelta
import pandas as pd

class DownloadBaiDuIndex(object):
    def __init__(self, cookie):
        self.cookie = cookie
        self.headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://index.baidu.com/v2/main/index.html",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6,ja;q=0.5,ru;q=0.4",
            'Cookie': self.cookie,
            "Host": "index.baidu.com",
            "X-Requested-With": "XMLHttpRequest",
            "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",

        }

    def decrypt(self, ptbk, index_data):
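        # ptbk's first half lists the cipher symbols and its second half the
        # characters they decode to; zipping the two halves gives the
        # substitution table. A small worked example (not a real ptbk value):
        # with ptbk = "ab,c12,3" the mapping is a->1, b->2, ","->",", c->3,
        # so decrypt("ab,c12,3", "ab,c") returns "12,3".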
        n = len(ptbk) // 2
        a = dict(zip(ptbk[:n], ptbk[n:]))
        return "".join([a[s] for s in index_data])

    def get_index_data_json(self, keys, start=None, end=None):
        words = [[{"name":key, "wordType":1}] for key in keys]
        words = str(words).replace(" ", "").replace("'", "\"")
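        # words is now a JSON-style string, e.g. [[{"name":"万科","wordType":1}]]
        # for keys=["万科"]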

        url = f'http://index.baidu.com/api/SearchApi/index?word={words}&area=0&startDate={start}&endDate={end}'
        print(words, start, end)
        res = requests.get(url, headers=self.headers)
        data = res.json()['data']
        uniqid = data['uniqid']
        url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
        res = requests.get(url, headers=self.headers)
        ptbk = res.json()['data']
        result = {}
        result["startDate"] = start
        result["endDate"] = end
        for userIndexe in data['userIndexes']:
            name = userIndexe['word'][0]['name']
            tmp = {}
            index_all = userIndexe['all']['data']
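            # The int() below is the first place the reported error can be
            # raised: if the decrypted string contains an empty field
            # (e.g. "1,2,,4" splits into ['1', '2', '', '4']), int('') fails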
            index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")]
            tmp["all"] = index_all_data
            index_pc = userIndexe['pc']['data']
            index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")]
            tmp["pc"] = index_pc_data
            index_wise = userIndexe['wise']['data']
            index_wise_data = [int(e)
                               for e in self.decrypt(ptbk, index_wise).split(",")]
            tmp["wise"] = index_wise_data
            result[name] = tmp
        return result

    def GetIndex(self, keys, start=None, end=None):
        today = date.today()
        if start is None:
            start = str(today - timedelta(days=8))
        if end is None:
            end = str(today - timedelta(days=2))

        try:
            raw_data = self.get_index_data_json(keys=keys, start=start, end=end)
            raw_data = pd.DataFrame(raw_data[keys[0]])
            raw_data.index = pd.date_range(start=start, end=end)

        except Exception as e:
            print(e)
            raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})

        finally:
            return raw_data

# Baidu Index Crawler
appended_data = []
names = ["万科", "宝钢股份", "宁波华翔", "邯郸钢铁"]
cookie = 'PSTM=1493135429; BIDUPSID=4E6249E17CE020DD96051F17E859065E; MCITY=-:; __yjs_duid=1_f2bdcf71928819b290246dfb5ec1f88b1627719752767; BAIDUID=EDE52BAB587B1289E4CFE6876999D70E:FG=1; BDUSS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV; ai_user=TU6P0S+hXPuepmudkzF3ab|2022-09-07T06:20:41.482Z; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1662531642; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc={"uid_":{"value":"2737153346","scope":1}}; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=ah840l8lal818k052kakma061hhppa518; ZFY=haOdTA3raQNehkj7kdTtwgp2mbl7MGaNDvXqpzATSJ0:C; BAIDUID_BFESS=EDE52BAB587B1289E4CFE6876999D70E:FG=1; delPer=0; PSINO=7; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_PSSID=36544_37117_37361_36885_37274_36806_36786_37244_37260_26350_37232; bdindexid=rn8uincmt59ojbfomaj3ijas86; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04129204022rfXD6ExmZPpUJhdFgUNKL4Qf58vyWFT2S9WLV9yYMYQuaAJmU3S1E39ned8eaf1lkOvzmrGB1ag+Jlrwf6nqg7TMyOdUIrHPpXAZtXo0rHsdOrEqtHM/bZHpdWofRX9/yLcabgvvrvlMoQqIlWgvw+ONVPggUTw1e2w+bt/EwSzYDQ1Yg467d2OHikWxK4pp63uhKDengOWFOCsY+vk7ptdZXsAvh2eijgfMNHfhdQq/zVIH9NFTRhT+pgZIC93eFJsp67DzV7YvSjinm0rZhA==60986356220794518584054979528195; __cas__rn__=412920402; __cas__st__212=10292876e9f34179535097291f4887e79777d53e349fc22b264d7913546801e104d3983195722c1cbba597de; __cas__id__212=42237265; CPID_212=42237265; CPTK_212=428319642; ab_sr=1.0.1_YTBlMzg0ZGQ2N2Y2NDUwZGM0OTUyOGZiYTRiYzg2NTE5ZTY5OWVkZjExY2Q2YTExZmJkMjFjMTZiMjVhN2E2ODkwYzE4MjJiZDU4ZGZiOGU3YWYwYmM3NmVkOWVkOWIyNGVmZWQ5YTFkNjM2NzU5OWJmMGJiOTZkZjAwNTVkMzUyNmMyNWEyMDIxZmYwZThiMmEwZGMwODgzODE3OTRiYQ==; RT="z=1&dm=baidu.com&si=c706899d-6871-43a3-8e6d-2c363b5d090b&ss=l7wbr8mm&sl=0&tt=0&bcn=https://fclog.baidu.com/log/weirwood?type=perf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1662907967; BDUSS_BFESS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV'
downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
for name in names:
    data = downloadbaiduindex.GetIndex(keys=[name], start='2020-01-01', end='2020-12-31')
    data['Ticker'] = name
    appended_data.append(data)

appended_data = pd.concat(appended_data)
appended_data.to_csv('Baidu_Index.csv')
For example, in the names list above, the first two keywords, "万科" and "宝钢股份", work, but the last two, "宁波华翔" and "邯郸钢铁", do not.
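To narrow it down, here is a minimal diagnostic sketch (assuming the same cookie and the class above, with a failing keyword hard-coded) that repeats the same two requests the class makes but prints the raw decrypted string before any int() conversion, so any empty fields become visible:

failing = '[[{"name":"宁波华翔","wordType":1}]]'
dl = DownloadBaiDuIndex(cookie=cookie)
url = f'http://index.baidu.com/api/SearchApi/index?word={failing}&area=0&startDate=2020-01-01&endDate=2020-12-31'
data = requests.get(url, headers=dl.headers).json()['data']
ptbk = requests.get(f'http://index.baidu.com/Interface/ptbk?uniqid={data["uniqid"]}', headers=dl.headers).json()['data']
# repr() makes empty fields like "1,2,,4" (or a fully empty string) easy to spot
print(repr(dl.decrypt(ptbk, data['userIndexes'][0]['all']['data'])))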

Any help would be really appreciated. Thanks!