Nov-29-2023, 06:35 PM
(This post was last modified: Nov-29-2023, 07:37 PM by snippsat.
Edit Reason: Added code tag
)
I cant rally code, just use chat gbt to write it for me and it gave me almost a working code. What it does it take a .csv file and expands the columns based on the duplicates in the first column. But the problem is with the column names, it probably easier to who show:
Here is the column names for the input file:
Here is the column names for the input file:
Output:"link" "education1 school" "education1 degree" "education1 start date" "education1 end date" "education1 grade" "education1 description"
Here is the output column names right now:Output:"link" "education1 school" "education2 degree" "education3 start date" "education4 end date" "education5 grade" "education6 description" "education7 school" "education8 degree" "education9 start date" "education10 end date" "education11 grade" "education12 description" "education13 school" "education14 degree" "education15 start date" "education16 end date" "education17 grade" "education18 description"
Here is the desired output column names:Output:"link" "education1 school" "education1 degree" "education1 start date" "education1 end date" "education1 grade" "education1 description" "education2 school" "education2 degree" "education2 start date" "education2 end date" "education2 grade" "education2 description" "education3 school" "education3 degree" "education3 start date" "education3 end date" "education3 grade" "education3 description"
No matter in how many creative ways I ask it to help me it still can't. I would appreciate if you could help me, here's the code:import pandas as pd import argparse import os import csv import re def expand_duplicates(df): expanded_rows = [] pattern = re.compile(r'^(.*?)(\d+)(.*?)$', re.I) for key_value in df.iloc[:, 0].unique(): group = df[df.iloc[:, 0] == key_value] expanded_row = {df.columns[0]: key_value} col_counter = {} for _, row in group.iterrows(): for col in df.columns[1:]: match = pattern.match(col) if match: col_name, col_num, col_suffix = match.groups() col_key = f"{col_name}{col_counter.get(col_name, 1)}{col_suffix}" col_counter[col_name] = col_counter.get(col_name, 1) + 1 expanded_row[col_key] = row[col] expanded_rows.append(expanded_row) df_expanded = pd.DataFrame(expanded_rows) return df_expanded def get_next_filename(output_file): base, ext = os.path.splitext(output_file) dir_name = os.path.dirname(output_file) file_name = os.path.basename(base) counter = 1 new_file = f"{file_name}({counter}){ext}" while os.path.exists(os.path.join(dir_name, new_file)): counter += 1 new_file = f"{file_name}({counter}){ext}" return os.path.join(dir_name, new_file) def main(input_file, output_file, separator='\t'): df = pd.read_csv(input_file, sep=separator, quotechar='"', quoting=csv.QUOTE_MINIMAL) df_expanded = expand_duplicates(df) if os.path.exists(output_file): output_file = get_next_filename(output_file) df_expanded.to_csv(output_file, index=False, sep=separator, quoting=csv.QUOTE_NONNUMERIC, quotechar='"') if __name__ == "__main__": parser = argparse.ArgumentParser(description="Expand duplicates in a CSV file.") parser.add_argument("-f", "--input_file", help="Path to the input CSV file", required=True) parser.add_argument("-d", "--output_dir", help="Path to the output directory", default="./data/out") parser.add_argument("-s", "--separator", help="Column separator for input and output files", default='\t') args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) input_filename = os.path.basename(args.input_file) output_file = os.path.join(args.output_dir, input_filename) main(args.input_file, output_file, separator=args.separator)