May-14-2020, 06:53 PM
Hello. Im trying to store multiple excel files into my database but need to match the rows of each excel row by Timestamp, ID, Content before inserting it. Therefor im using REGEX
Each Timestamp ID and Content can vary from row to row and from excel file to excel file.
Im stucking with an Errorcode i get, I hope you can help me out here.
This is my code :
Each Timestamp ID and Content can vary from row to row and from excel file to excel file.
Im stucking with an Errorcode i get, I hope you can help me out here.
This is my code :
import os import re import pymysql pymysql.install_as_MySQLdb() import pandas as pd import sqlalchemy def insert_or_update(engine, pd_table, table_name): inserts = 0 updates = 0 for i in range(len(pd_table)): vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values] # print(vals_with_quotes) update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])] query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \ f"VALUES ({', '.join(vals_with_quotes)}) " \ f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}" print(query) result = engine.execute(query) if result.lastrowid == 0: updates += 1 else: inserts += 1 print(f"Inserted {inserts} rows and updated {updates} rows.") schema = '---' alchemy_connect = "---" engine = sqlalchemy.create_engine(alchemy_connect) # connect to server engine.execute(f"USE {schema}") # select new db # engine.execute("SET NAMES UTF8MB4;") query = "SELECT * FROM --- where study=---" pm = pd.read_sql(query, engine) rootpath = "path/" for root, dirs, files in os.walk(rootpath): for file in files: print(root, dirs, files, file) d = pd.read_excel(root + file, header=None) d.drop(columns=[0], inplace=True) d.rename(columns={1: "content"}, inplace=True) participants = [] for ix, row in d.iterrows(): c = row["content"] match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c) if match: timestamp = match.group(1) id_code = match.group(2) only_content = match.group(3) if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code): id_code = "Versuchsleiter" # get pid participant = pm.loc[pm["id_code"] == id_code, "pid"] try: pid = participant.values[0] except: pid = "Versuchsleiter" # print(ix, pid, id_code, only_content, timestamp) if pid and pid not in participants and pid != "Versuchsleiter": participants.append(pid) d.loc[ix, "pid"] = pid d.loc[ix, "timestamp"] = timestamp d.loc[ix, "content"] = only_content.strip() d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1 d = d[["pid", "is_participant", "content", "timestamp"]] d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0] d.loc[(d['pid'] == None), "pid"] = participants[0] # with pd.option_context('display.max_rows', None, 'display.max_columns', None): # print(d) insert_or_update(engine, d, "YM_Transcripts")And this is the Errorcode i get :
File "C:/Users/.../PycharmProjects/.../.../---.py", line 20, in insert_or_update result = engine.execute(query) File "C:\Users\...\PycharmProjects\...\...\lib\site-packages\sqlalchemy\engine\base.py", line 2191, in execute return connection.execute(statement, *multiparams, **params) TypeError: not enough arguments for format string