Python Forum

Full Version: Scraping Data issues
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I'm trying to scrape data from a game's market and use the data to track it.

So I have the scraper log in, it goes and pulls some items on the market and stores them in a table.

Now I'm trying to have it use those stored items in the table, to compare it to similar items being sold on the market. I have it so after it stores each item in the table, it goes and checks for the item id on each item, then looks them up.

When it goes to look them up again, it is making me log into the game again. Is this because it isn't passing cookies? Also, am I doing all this correctly? I want it to be optimized as best I can.

Here is my code:
from bs4 import BeautifulSoup
from lxml import html
import requests
import re
from collections import defaultdict

# Start the session
session = requests.Session()

market_items = defaultdict(dict)
compare_prices = defaultdict(dict)
# Create the payload
username = "EMAIL"
password = "PASSWORD"
authenticity_token = 0

LOGIN_URL = "https://web.simple-mmo.com/login"
URL = "https://web.simple-mmo.com/market/collectables/all"
PriceURL = "https://web.simple-mmo.com/market/all/all"




def findAveragePrice():
	session_requests = requests.session()
	key_items = market_items.items()
	t = len(key_items)
	for key in market_items:
		lookID = market_items[key]["ID"]
		print(lookID)
		payload = {
			"itemid": lookID,
			"new_page": "true",
			"_token": authenticity_token
		}
		result = session_requests.get(PriceURL, data = payload, headers = dict(referer = PriceURL))
		soup = BeautifulSoup(result.content, 'html.parser')
		print(result.text)
		#print(market_items[key]["PRICE"])
		
		pricematch = soup.find_all('div', class_='individual-item')
		print(pricematch)
		for match in pricematch:
			x = match.find('a')['onclick']
			title = x.split("retrieveMarketItem(")[1].strip().split(')')[0]
			ITEMID = title.split(",")[0].lstrip()
			RdmNum = title.split(",'")[1].lstrip().split("'")[0]
			price = title.split(" '")[1].lstrip().split("',")[0]
			player = title.split(" '")[2].lstrip().split("'")[0]
			time = title.split(" '")[3].lstrip().split("'")[0]
			
			compare_prices[key]["PRICE"]
			
			print(compare_prices)
			
			
		
		
		
		
def main():
	session_requests = requests.session()
	
	result = session_requests.get(LOGIN_URL)
	tree = html.fromstring(result.text)
	authenticity_token = list(set(tree.xpath("//input[@name='_token']/@value")))[0]
	
	payload = {
		"email": username,
		"password": password,
		"_token": authenticity_token
	}
	
	

	result = session_requests.post(LOGIN_URL, data = payload, headers = dict(referer = LOGIN_URL))

	result = session_requests.get(URL, headers = dict(referer = URL))
	tree = html.fromstring(result.content)
	bucket_names = tree.xpath("//div[@class='individual-item']/span/text()")
	
	soup = BeautifulSoup(result.content, 'html.parser')
	#print(soup)
	
	collectables = soup.find_all('div', class_='individual-item')
	

	for collectable in collectables:
		x = collectable.find('a')['onclick']
		#print(x)
		
		title = x.split("retrieveMarketItem(")[1].strip().split(')')[0]
		ITEMID = title.split(",")[0].lstrip()
		RdmNum = title.split(",'")[1].lstrip().split("'")[0]
		price = title.split(" '")[1].lstrip().split("',")[0]
		player = title.split(" '")[2].lstrip().split("'")[0]
		time = title.split(" '")[3].lstrip().split("'")[0]
		
		#print(title)
		#print("ID: " + ITEMID)
		#print("UniqueID: " + RdmNum)
		#print("PRICE: " + price)
		#print("PLAYER: " + player)
		#print("TIME: " + time)
		
		#market_items["UniqueID"] = RdmNum
		market_items[RdmNum]["ID"] = ITEMID
		market_items[RdmNum]["PRICE"] = price
		market_items[RdmNum]["SELLER"] = player
		market_items[RdmNum]["TIME"] = time
		
		#print(market_items)
		
	findAveragePrice()	
		

if __name__ == '__main__':
    main()	
	
So right now, all I'm getting returned is the login page. I think its because cookies aren't being passed through? But Im not sure how to fix that.
If problem is only in session then read docs https://requests.readthedocs.io/en/maste.../advanced/