Python Forum
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv.
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv.
#1
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv. Works fine for 57k

import pandas as pd
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
import datetime as datetime2
from datetime import date, timedelta
import time
from dates import StartDay, StartMonth, StartYear, EndDay,EndYear,EndMonth

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = './XXXXXXX.json'
VIEW_ID = 'XXXXXXX'
PAGESIZE = 100000

#StartDate = '2022-10-13'
#EndDate = '2022-10-14'

#RangeStart = datetime2.date(2022, 9, 1)
#RangeEnd = datetime2.date(2022, 9, 1)
#RangeStart = datetime2.date(2022, 1, 3) # from and inluding this date
#RangeEnd = datetime2.date(2022, 1, 4) # up to this date (not including)


RangeStart = datetime2.date(StartYear, StartMonth, StartDay) # from and inluding this date
RangeEnd = datetime2.date(EndYear, EndMonth, EndDay) # up to this date (not including)

delta = datetime2.timedelta(days=1)

#main.RangeStart

def initialize_analyticsreporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics


# Get one report page
def get_report(analytics, pageTokenVar):
    return analytics.reports().batchGet(
        body={

            ##################################################################################

            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': startingDateFeed, 'endDate': singleDateFeed}],
                    'metrics': [
                        {'expression': 'ga:users'},
                        {'expression': 'ga:newUsers'},
                        #{'expression': 'ga:pageviews'},
                        #{'expression': 'ga:bounces'},
                        #{'expression': 'ga:sessions'},
                        #{'expression': 'ga:searchUniques'},
                        #{'expression': 'ga:timeOnPage'}
                        {'expression': 'ga:searchUniques'},
                        {'expression': 'ga:bounces'},
                        {'expression': 'ga:bounceRate'},
                        {'expression': 'ga:sessionDuration'},
                        {'expression': 'ga:avgSessionDuration'},
                        {'expression': 'ga:avgTimeOnPage'},
                        {'expression': 'ga:organicSearches'}
                    ],
                    'dimensions': [
                        {'name': 'ga:userType'},
                        {'name': 'ga:sessionCount'},
                        {'name': 'ga:dateHourMinute'},
                        #{'name': 'ga:sourceMedium'},
                        #{'name': 'ga:deviceCategory'},
                        #{'name': 'ga:cityId'},
                        #{'name': 'ga:countryIsoCode'},
                        #{'name': 'ga:landingPagePath'},
                        #{'name': 'ga:pagePath'},
                        {'name': 'ga:campaign'},
                        #{'name' : 'ga:searchUniques'}
                        #{'name': 'ga:pagePathLevel1'},
                        #{'name': 'ga:pagePathLevel2'},
                        #{'name': 'ga:pagePathLevel3'},
                        #{'name': 'ga:pagePathLevel4'}
                    ],
                    'samplingLevel': 'LARGE',
                    'pageSize': PAGESIZE
                }]

            ##################################################################################

        }
    ).execute()


def handle_report(analytics, pagetoken, rows):
    response = get_report(analytics, pagetoken)

    # Header, Dimentions Headers, Metric Headers
    columnHeader = response.get("reports")[0].get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])

    # Pagination
    pagetoken = response.get("reports")[0].get('nextPageToken', None)

    # Rows
    rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
    rows = rows + rowsNew
    print("len(rows): " + str(len(rows)))
    print(pagetoken)
    print(response.get('nextPageToken'))

    #print(response.get('rows', []))

    # Recursivly query next page
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
    else:
        # nicer results
        nicerows = []
        for row in rows:
            dic = {}
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])

            for header, dimension in zip(dimensionHeaders, dimensions):
                dic[header] = dimension

            for i, values in enumerate(dateRangeValues):
                for metric, value in zip(metricHeaders, values.get('values')):
                    if ',' in value or ',' in value:
                        dic[metric.get('name')] = float(value)
                    else:
                        dic[metric.get('name')] = float(value)
            nicerows.append(dic)
        return nicerows


# Start
def main():

    #while (RangeStart <= RangeEnd):
    analytics = initialize_analyticsreporting()

    global dfanalytics
    dfanalytics = []

    rows = []
    rows = handle_report(analytics, '0', rows)

    dfanalytics = pd.DataFrame(list(rows))
    YYYYmmddHHMMSS = datetime.today().strftime('%Y%m%d%H%M%S')
    dfanalytics.to_csv("./outputRAW/behaviour28/behaviour28days"+singleDateFeed+"_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))
    dfanalytics.to_csv("./outputRAW/behaviour28/backupCopy/behaviour28days" + singleDateFeed + "_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))

        #RangeStart += delta



#if __name__ == '__main__':
#    main()

def daterange(RangeStart,RangeEnd):

    for n in range(int((RangeEnd - RangeStart).days)):
        yield RangeStart + timedelta(n)

for singleDate in daterange(RangeStart,RangeEnd):
    #print(singleDate.strftime("%Y-%m-%d"))
    singleDateMinusSeven = singleDate + timedelta(days=-7)

    #print(singleDate + timedelta(days=-7))
    singleDateFeed = singleDate.strftime("%Y-%m-%d")
    startingDateFeed = singleDateMinusSeven.strftime("%Y-%m-%d")
    #endingDateFeed = startingDateFeed + timedelta(days=-7)
    #endingDateFeed = singleDate.strftime("%Y-%m-%d") + timedelta(days=-7)
    main()

    print(startingDateFeed)
    print(singleDateFeed)
    #print(endingDateFeed)
    #time.sleep(3) #Sleep for a number of secoonds before running the next day - to decrease number of api calls


#dateTest = datetime2.date(2022, 2, 1)
#print (dateTest, end="\n")
Help!

My code just keeps outputting: len(rows): 100000

100000

None

len(rows): 200000

100000

None

len(rows): 300000

100000

None

len(rows): 400000

100000

None

len(rows): 500000

100000

None

looping forever rather than pulling the next page.

I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?

 #Recursivly query next page
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
It's this bit that doesn't seem to work This runs fine when there are 57k rows, but when I expand the range, the above happens. Many thanks in advance, and thank you for your patience!
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  New on python. Needs help with Google sheet jeromep 1 142 Apr-25-2024, 06:47 PM
Last Post: deanhystad
  how do you style data frame that has empty rows. gsaray101 0 536 Sep-08-2023, 05:20 PM
Last Post: gsaray101
  Pulling Specifics Words/Numbers from String bigpapa 2 772 May-01-2023, 07:22 PM
Last Post: bigpapa
  Having trouble installing scikit-learn via VSC and pulling my hair out pythonturtle 1 767 Feb-07-2023, 02:23 AM
Last Post: Larz60+
  How to properly format rows and columns in excel data from parsed .txt blocks jh67 7 1,890 Dec-12-2022, 08:22 PM
Last Post: jh67
  Pulling username from Tuple pajd 21 3,432 Oct-07-2022, 01:33 PM
Last Post: pajd
  Deploy Python to Cloud and save output to Google Drive chandrabr80 2 1,590 Jan-25-2022, 06:56 AM
Last Post: ndc85430
  The code I have written removes the desired number of rows, but wrong rows Jdesi1983 0 1,636 Dec-08-2021, 04:42 AM
Last Post: Jdesi1983
  install apache-airflow[postgres,google] on Python 3.8.12 virtual env ShahajaK 1 7,835 Oct-07-2021, 03:05 PM
Last Post: Larz60+
  Iterating Through Data Frame Rows JoeDainton123 2 2,945 Aug-09-2021, 07:01 AM
Last Post: Pedroski55

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020