Dec-19-2022, 11:11 PM
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv. Works fine for 57k
My code just keeps outputting: len(rows): 100000
100000
None
len(rows): 200000
100000
None
len(rows): 300000
100000
None
len(rows): 400000
100000
None
len(rows): 500000
100000
None
looping forever rather than pulling the next page.
I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?
import pandas as pd from apiclient.discovery import build from oauth2client.service_account import ServiceAccountCredentials from datetime import datetime import datetime as datetime2 from datetime import date, timedelta import time from dates import StartDay, StartMonth, StartYear, EndDay,EndYear,EndMonth SCOPES = ['https://www.googleapis.com/auth/analytics.readonly'] KEY_FILE_LOCATION = './XXXXXXX.json' VIEW_ID = 'XXXXXXX' PAGESIZE = 100000 #StartDate = '2022-10-13' #EndDate = '2022-10-14' #RangeStart = datetime2.date(2022, 9, 1) #RangeEnd = datetime2.date(2022, 9, 1) #RangeStart = datetime2.date(2022, 1, 3) # from and inluding this date #RangeEnd = datetime2.date(2022, 1, 4) # up to this date (not including) RangeStart = datetime2.date(StartYear, StartMonth, StartDay) # from and inluding this date RangeEnd = datetime2.date(EndYear, EndMonth, EndDay) # up to this date (not including) delta = datetime2.timedelta(days=1) #main.RangeStart def initialize_analyticsreporting(): credentials = ServiceAccountCredentials.from_json_keyfile_name( KEY_FILE_LOCATION, SCOPES) analytics = build('analyticsreporting', 'v4', credentials=credentials) return analytics # Get one report page def get_report(analytics, pageTokenVar): return analytics.reports().batchGet( body={ ################################################################################## 'reportRequests': [ { 'viewId': VIEW_ID, 'dateRanges': [{'startDate': startingDateFeed, 'endDate': singleDateFeed}], 'metrics': [ {'expression': 'ga:users'}, {'expression': 'ga:newUsers'}, #{'expression': 'ga:pageviews'}, #{'expression': 'ga:bounces'}, #{'expression': 'ga:sessions'}, #{'expression': 'ga:searchUniques'}, #{'expression': 'ga:timeOnPage'} {'expression': 'ga:searchUniques'}, {'expression': 'ga:bounces'}, {'expression': 'ga:bounceRate'}, {'expression': 'ga:sessionDuration'}, {'expression': 'ga:avgSessionDuration'}, {'expression': 'ga:avgTimeOnPage'}, {'expression': 'ga:organicSearches'} ], 'dimensions': [ {'name': 'ga:userType'}, {'name': 'ga:sessionCount'}, {'name': 'ga:dateHourMinute'}, #{'name': 'ga:sourceMedium'}, #{'name': 'ga:deviceCategory'}, #{'name': 'ga:cityId'}, #{'name': 'ga:countryIsoCode'}, #{'name': 'ga:landingPagePath'}, #{'name': 'ga:pagePath'}, {'name': 'ga:campaign'}, #{'name' : 'ga:searchUniques'} #{'name': 'ga:pagePathLevel1'}, #{'name': 'ga:pagePathLevel2'}, #{'name': 'ga:pagePathLevel3'}, #{'name': 'ga:pagePathLevel4'} ], 'samplingLevel': 'LARGE', 'pageSize': PAGESIZE }] ################################################################################## } ).execute() def handle_report(analytics, pagetoken, rows): response = get_report(analytics, pagetoken) # Header, Dimentions Headers, Metric Headers columnHeader = response.get("reports")[0].get('columnHeader', {}) dimensionHeaders = columnHeader.get('dimensions', []) metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', []) # Pagination pagetoken = response.get("reports")[0].get('nextPageToken', None) # Rows rowsNew = response.get("reports")[0].get('data', {}).get('rows', []) rows = rows + rowsNew print("len(rows): " + str(len(rows))) print(pagetoken) print(response.get('nextPageToken')) #print(response.get('rows', [])) # Recursivly query next page if pagetoken != None: return handle_report(analytics, pagetoken, rows) else: # nicer results nicerows = [] for row in rows: dic = {} dimensions = row.get('dimensions', []) dateRangeValues = row.get('metrics', []) for header, dimension in zip(dimensionHeaders, dimensions): dic[header] = dimension for i, values in enumerate(dateRangeValues): for metric, value in zip(metricHeaders, values.get('values')): if ',' in value or ',' in value: dic[metric.get('name')] = float(value) else: dic[metric.get('name')] = float(value) nicerows.append(dic) return nicerows # Start def main(): #while (RangeStart <= RangeEnd): analytics = initialize_analyticsreporting() global dfanalytics dfanalytics = [] rows = [] rows = handle_report(analytics, '0', rows) dfanalytics = pd.DataFrame(list(rows)) YYYYmmddHHMMSS = datetime.today().strftime('%Y%m%d%H%M%S') dfanalytics.to_csv("./outputRAW/behaviour28/behaviour28days"+singleDateFeed+"_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS)) dfanalytics.to_csv("./outputRAW/behaviour28/backupCopy/behaviour28days" + singleDateFeed + "_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS)) #RangeStart += delta #if __name__ == '__main__': # main() def daterange(RangeStart,RangeEnd): for n in range(int((RangeEnd - RangeStart).days)): yield RangeStart + timedelta(n) for singleDate in daterange(RangeStart,RangeEnd): #print(singleDate.strftime("%Y-%m-%d")) singleDateMinusSeven = singleDate + timedelta(days=-7) #print(singleDate + timedelta(days=-7)) singleDateFeed = singleDate.strftime("%Y-%m-%d") startingDateFeed = singleDateMinusSeven.strftime("%Y-%m-%d") #endingDateFeed = startingDateFeed + timedelta(days=-7) #endingDateFeed = singleDate.strftime("%Y-%m-%d") + timedelta(days=-7) main() print(startingDateFeed) print(singleDateFeed) #print(endingDateFeed) #time.sleep(3) #Sleep for a number of secoonds before running the next day - to decrease number of api calls #dateTest = datetime2.date(2022, 2, 1) #print (dateTest, end="\n")Help!
My code just keeps outputting: len(rows): 100000
100000
None
len(rows): 200000
100000
None
len(rows): 300000
100000
None
len(rows): 400000
100000
None
len(rows): 500000
100000
None
looping forever rather than pulling the next page.
I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?
#Recursivly query next page if pagetoken != None: return handle_report(analytics, pagetoken, rows)It's this bit that doesn't seem to work This runs fine when there are 57k rows, but when I expand the range, the above happens. Many thanks in advance, and thank you for your patience!