Python Forum

Full Version: (Python) Pulling data from UA Google Analytics with more than 100k rows into csv.
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv. Works fine for 57k

import pandas as pd
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
import datetime as datetime2
from datetime import date, timedelta
import time
from dates import StartDay, StartMonth, StartYear, EndDay,EndYear,EndMonth

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = './XXXXXXX.json'
VIEW_ID = 'XXXXXXX'
PAGESIZE = 100000

#StartDate = '2022-10-13'
#EndDate = '2022-10-14'

#RangeStart = datetime2.date(2022, 9, 1)
#RangeEnd = datetime2.date(2022, 9, 1)
#RangeStart = datetime2.date(2022, 1, 3) # from and inluding this date
#RangeEnd = datetime2.date(2022, 1, 4) # up to this date (not including)


RangeStart = datetime2.date(StartYear, StartMonth, StartDay) # from and inluding this date
RangeEnd = datetime2.date(EndYear, EndMonth, EndDay) # up to this date (not including)

delta = datetime2.timedelta(days=1)

#main.RangeStart

def initialize_analyticsreporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics


# Get one report page
def get_report(analytics, pageTokenVar):
    return analytics.reports().batchGet(
        body={

            ##################################################################################

            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': startingDateFeed, 'endDate': singleDateFeed}],
                    'metrics': [
                        {'expression': 'ga:users'},
                        {'expression': 'ga:newUsers'},
                        #{'expression': 'ga:pageviews'},
                        #{'expression': 'ga:bounces'},
                        #{'expression': 'ga:sessions'},
                        #{'expression': 'ga:searchUniques'},
                        #{'expression': 'ga:timeOnPage'}
                        {'expression': 'ga:searchUniques'},
                        {'expression': 'ga:bounces'},
                        {'expression': 'ga:bounceRate'},
                        {'expression': 'ga:sessionDuration'},
                        {'expression': 'ga:avgSessionDuration'},
                        {'expression': 'ga:avgTimeOnPage'},
                        {'expression': 'ga:organicSearches'}
                    ],
                    'dimensions': [
                        {'name': 'ga:userType'},
                        {'name': 'ga:sessionCount'},
                        {'name': 'ga:dateHourMinute'},
                        #{'name': 'ga:sourceMedium'},
                        #{'name': 'ga:deviceCategory'},
                        #{'name': 'ga:cityId'},
                        #{'name': 'ga:countryIsoCode'},
                        #{'name': 'ga:landingPagePath'},
                        #{'name': 'ga:pagePath'},
                        {'name': 'ga:campaign'},
                        #{'name' : 'ga:searchUniques'}
                        #{'name': 'ga:pagePathLevel1'},
                        #{'name': 'ga:pagePathLevel2'},
                        #{'name': 'ga:pagePathLevel3'},
                        #{'name': 'ga:pagePathLevel4'}
                    ],
                    'samplingLevel': 'LARGE',
                    'pageSize': PAGESIZE
                }]

            ##################################################################################

        }
    ).execute()


def handle_report(analytics, pagetoken, rows):
    response = get_report(analytics, pagetoken)

    # Header, Dimentions Headers, Metric Headers
    columnHeader = response.get("reports")[0].get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])

    # Pagination
    pagetoken = response.get("reports")[0].get('nextPageToken', None)

    # Rows
    rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
    rows = rows + rowsNew
    print("len(rows): " + str(len(rows)))
    print(pagetoken)
    print(response.get('nextPageToken'))

    #print(response.get('rows', []))

    # Recursivly query next page
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
    else:
        # nicer results
        nicerows = []
        for row in rows:
            dic = {}
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])

            for header, dimension in zip(dimensionHeaders, dimensions):
                dic[header] = dimension

            for i, values in enumerate(dateRangeValues):
                for metric, value in zip(metricHeaders, values.get('values')):
                    if ',' in value or ',' in value:
                        dic[metric.get('name')] = float(value)
                    else:
                        dic[metric.get('name')] = float(value)
            nicerows.append(dic)
        return nicerows


# Start
def main():

    #while (RangeStart <= RangeEnd):
    analytics = initialize_analyticsreporting()

    global dfanalytics
    dfanalytics = []

    rows = []
    rows = handle_report(analytics, '0', rows)

    dfanalytics = pd.DataFrame(list(rows))
    YYYYmmddHHMMSS = datetime.today().strftime('%Y%m%d%H%M%S')
    dfanalytics.to_csv("./outputRAW/behaviour28/behaviour28days"+singleDateFeed+"_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))
    dfanalytics.to_csv("./outputRAW/behaviour28/backupCopy/behaviour28days" + singleDateFeed + "_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))

        #RangeStart += delta



#if __name__ == '__main__':
#    main()

def daterange(RangeStart,RangeEnd):

    for n in range(int((RangeEnd - RangeStart).days)):
        yield RangeStart + timedelta(n)

for singleDate in daterange(RangeStart,RangeEnd):
    #print(singleDate.strftime("%Y-%m-%d"))
    singleDateMinusSeven = singleDate + timedelta(days=-7)

    #print(singleDate + timedelta(days=-7))
    singleDateFeed = singleDate.strftime("%Y-%m-%d")
    startingDateFeed = singleDateMinusSeven.strftime("%Y-%m-%d")
    #endingDateFeed = startingDateFeed + timedelta(days=-7)
    #endingDateFeed = singleDate.strftime("%Y-%m-%d") + timedelta(days=-7)
    main()

    print(startingDateFeed)
    print(singleDateFeed)
    #print(endingDateFeed)
    #time.sleep(3) #Sleep for a number of secoonds before running the next day - to decrease number of api calls


#dateTest = datetime2.date(2022, 2, 1)
#print (dateTest, end="\n")
Help!

My code just keeps outputting: len(rows): 100000

100000

None

len(rows): 200000

100000

None

len(rows): 300000

100000

None

len(rows): 400000

100000

None

len(rows): 500000

100000

None

looping forever rather than pulling the next page.

I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?

 #Recursivly query next page
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
It's this bit that doesn't seem to work This runs fine when there are 57k rows, but when I expand the range, the above happens. Many thanks in advance, and thank you for your patience!