
python multiprocessing help -- to extract 10 SQL tables into CSV
Hi Team,

Need your help improving my existing code with multithreading or multiprocessing for the situation below.

I have 10 SQL tables in a single database, and I want to extract all of them in parallel, without causing memory problems or slowness for other team members.

Suppose I have the 10 tables below in the Customer database.

I will accept the database name as an argument from the user.

SQL Table List:
Table1, Table2, Table3, Table4, Table5, Table6, Table7, Table8, Table9, Table10


import pyodbc
import csv
import os

connection = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=DESKTOP-GQK64O6;DATABASE=Customer;Trusted_Connection=yes;')

cursor = connection.cursor()
tbl = "Table1"  # table being extracted; the real script would do this for all ten
qry = f"Select * from {tbl}"
cursor.execute(qry)

data = cursor.fetchall()
print(len(data))

folderPath = f"C:\\Users\\malle\\OneDrive\\Desktop\\C\\test_data\\output{tbl}"
os.makedirs(folderPath, exist_ok=True)  # create the output folder if it doesn't exist

count = 0
# for reply in cursor.execute(qry):
for x in data:
    count = count + 1
    print(count)
    fname = "Row"+str(count)+".csv"
    fullpath = os.path.join(folderPath,fname)
    print(fullpath)
    with open(fullpath,"w",newline="") as outfile:
        writer = csv.writer(outfile,delimiter="|",quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(col[0] for col in cursor.description)
        writer.writerow(x)
        print(f"I am row {count}",x)
cursor.close()
connection.close()
Accessing the same DB with many processes/threads can slow down the computer because they all bump into each other trying to reach the same DB. Consider reading each table into memory one at a time and passing that data to a function called by multiprocessing.
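Something like this is what I mean -- a rough sketch, assuming the connection string and output folder from your post; fetch_table and write_rows are made-up names. Only the main process talks to the database, one table at a time; the workers just receive the already-fetched rows and do the file writing:

import csv
import os
import multiprocessing

import pyodbc

CONN_STR = ('DRIVER={ODBC Driver 17 for SQL Server};'
            'SERVER=DESKTOP-GQK64O6;DATABASE=Customer;Trusted_Connection=yes;')
TABLES = [f"Table{i}" for i in range(1, 11)]

def fetch_table(cursor, tbl):
    """Read one table into plain tuples so the rows can be pickled."""
    cursor.execute(f"Select * from {tbl}")
    headers = [col[0] for col in cursor.description]
    return headers, [tuple(row) for row in cursor.fetchall()]

def write_rows(job):
    """Worker: write one table's rows out as per-row CSV files."""
    tbl, headers, rows = job
    folder = f"C:\\Users\\malle\\OneDrive\\Desktop\\C\\test_data\\output{tbl}"
    os.makedirs(folder, exist_ok=True)
    for count, row in enumerate(rows, start=1):
        path = os.path.join(folder, f"Row{count}.csv")
        with open(path, "w", newline="") as outfile:
            writer = csv.writer(outfile, delimiter="|", quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(headers)
            writer.writerow(row)
    return tbl, len(rows)

def main():
    connection = pyodbc.connect(CONN_STR)
    cursor = connection.cursor()
    with multiprocessing.Pool() as pool:
        results = []
        for tbl in TABLES:
            job = (tbl, *fetch_table(cursor, tbl))                # queries stay sequential
            results.append(pool.apply_async(write_rows, (job,)))  # writing runs in parallel
        connection.close()
        for r in results:
            tbl, n = r.get()
            print(f"{tbl}: wrote {n} rows")

if __name__ == "__main__":
    main()

Using apply_async as each table finishes lets the next query overlap with the file writing, while the database itself only ever sees one query at a time.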
Agree, somewhat. If all the bottlenecks are I/O (database queries and writing files), it's most likely that the only thing multithreading or multiprocessing would do is slow things down.

But I would still do an experiment: a small test case that you can put together quickly and use to decide how to proceed. I'd focus on the database queries, as that is the slowest thing you are doing.
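The experiment could be as small as this -- a sketch, not a benchmark harness; query_table is a made-up name, and each worker opens its own connection because pyodbc connections can't be shared between processes:

import time
import multiprocessing

import pyodbc

CONN_STR = ('DRIVER={ODBC Driver 17 for SQL Server};'
            'SERVER=DESKTOP-GQK64O6;DATABASE=Customer;Trusted_Connection=yes;')
TABLES = [f"Table{i}" for i in range(1, 11)]

def query_table(tbl):
    """Open a private connection, pull one table, return its row count."""
    connection = pyodbc.connect(CONN_STR)
    cursor = connection.cursor()
    cursor.execute(f"Select * from {tbl}")
    n = len(cursor.fetchall())
    connection.close()
    return n

def timed(label, fn):
    start = time.perf_counter()  # wall clock, so work in child processes counts
    counts = fn()
    print(f"{label}: {time.perf_counter() - start:.2f}s for {sum(counts)} rows")

if __name__ == "__main__":
    timed("sequential", lambda: [query_table(t) for t in TABLES])
    with multiprocessing.Pool(processes=4) as pool:
        timed("4 processes", lambda: pool.map(query_table, TABLES))

If the pooled run isn't clearly faster than the sequential one, the parallelism isn't buying you anything.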
Is this the correct way of doing multiprocessing?


import pyodbc
import csv
import os
import time
import multiprocessing


def Extract_to_csv(tbl, cursor):
    qry = f"Select * from {tbl}"
    cursor.execute(qry)
    data = cursor.fetchall()
    print(len(data))
    folderPath = f"D:\\test_data\\output{tbl}"
    os.makedirs(folderPath, exist_ok=True)  # create the output folder if it doesn't exist
    count = 0
    # for reply in cursor.execute(qry):
    for x in data:
        count = count + 1
        print(count)
        fname = "Row"+str(count)+".csv"
        fullpath = os.path.join(folderPath,fname)
        print(fullpath)
        with open(fullpath,"w",newline="") as outfile:
            writer = csv.writer(outfile,delimiter="|",quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(col[0] for col in cursor.description)
            writer.writerow(x)
            print(f"I am row {count}",x)

def main():
    connection = pyodbc.connect(
        'DRIVER={ODBC Driver 17 for SQL Server};SERVER=DESKTOP-GQK64O6;DATABASE=Customer;Trusted_Connection=yes;')
    cursor = connection.cursor()

    processes = []
    for i in range(1, 11):
        p = multiprocessing.Process(target=Extract_to_csv, args=(f"Table{i}", cursor))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
    
    print("End of main")


if __name__ == "__main__":
    start_time = time.process_time()
    main()
    end_time = time.process_time()
    print("Execution time is : ", end_time - start_time)