Python Forum
Need help with some code.
#1
I can't really code, I just used ChatGPT to write it for me and it gave me an almost working script. What it does is take a .csv file and expand the columns based on duplicates in the first column. But the problem is with the column names; it's probably easier to just show:

Here are the column names for the input file:
Output:
"link" "education1 school" "education1 degree" "education1 start date" "education1 end date" "education1 grade" "education1 description"
Here are the output column names right now:
Output:
"link" "education1 school" "education2 degree" "education3 start date" "education4 end date" "education5 grade" "education6 description" "education7 school" "education8 degree" "education9 start date" "education10 end date" "education11 grade" "education12 description" "education13 school" "education14 degree" "education15 start date" "education16 end date" "education17 grade" "education18 description"
Here are the desired output column names:
Output:
"link" "education1 school" "education1 degree" "education1 start date" "education1 end date" "education1 grade" "education1 description" "education2 school" "education2 degree" "education2 start date" "education2 end date" "education2 grade" "education2 description" "education3 school" "education3 degree" "education3 start date" "education3 end date" "education3 grade" "education3 description"
No matter how many creative ways I ask it to help me, it still can't. I would appreciate it if you could help me; here's the code:
import pandas as pd
import argparse
import os
import csv
import re

def expand_duplicates(df):
    expanded_rows = []

    # Split a column name into prefix, digits, and suffix, e.g. "education1 school".
    pattern = re.compile(r'^(.*?)(\d+)(.*?)$', re.I)

    for key_value in df.iloc[:, 0].unique():
        # All rows that share the same value in the first column.
        group = df[df.iloc[:, 0] == key_value]
        expanded_row = {df.columns[0]: key_value}

        # Running counter keyed by the column-name prefix (the part before the digits).
        col_counter = {}

        for _, row in group.iterrows():
            for col in df.columns[1:]:
                match = pattern.match(col)
                if match:
                    col_name, col_num, col_suffix = match.groups()
                    col_key = f"{col_name}{col_counter.get(col_name, 1)}{col_suffix}"
                    col_counter[col_name] = col_counter.get(col_name, 1) + 1

                    expanded_row[col_key] = row[col]

        expanded_rows.append(expanded_row)

    df_expanded = pd.DataFrame(expanded_rows)
    return df_expanded

def get_next_filename(output_file):
    base, ext = os.path.splitext(output_file)
    dir_name = os.path.dirname(output_file)
    file_name = os.path.basename(base)
    counter = 1
    new_file = f"{file_name}({counter}){ext}"

    while os.path.exists(os.path.join(dir_name, new_file)):
        counter += 1
        new_file = f"{file_name}({counter}){ext}"

    return os.path.join(dir_name, new_file)

def main(input_file, output_file, separator='\t'):
    df = pd.read_csv(input_file, sep=separator, quotechar='"', quoting=csv.QUOTE_MINIMAL)

    df_expanded = expand_duplicates(df)

    if os.path.exists(output_file):
        output_file = get_next_filename(output_file)

    df_expanded.to_csv(output_file, index=False, sep=separator, quoting=csv.QUOTE_NONNUMERIC, quotechar='"')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Expand duplicates in a CSV file.")
    parser.add_argument("-f", "--input_file", help="Path to the input CSV file", required=True)
    parser.add_argument("-d", "--output_dir", help="Path to the output directory", default="./data/out")
    parser.add_argument("-s", "--separator", help="Column separator for input and output files", default='\t')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    input_filename = os.path.basename(args.input_file)
    output_file = os.path.join(args.output_dir, input_filename)

    main(args.input_file, output_file, separator=args.separator)
snippsat wrote Nov-29-2023, 07:37 PM:
Added code tags to your post; look at BBCode for how to use them.
#2
Post a sample of the .csv file.
#3
(Nov-29-2023, 07:54 PM)snippsat Wrote: Post a sample of the .csv file.

Attached Files

.csv   input.csv (Size: 3.89 KB / Downloads: 22)
.csv   current_output.csv (Size: 4.09 KB / Downloads: 39)
.csv   desired_output.csv (Size: 4.08 KB / Downloads: 26)
#4
Try something like this.
import pandas as pd
pd.set_option('display.expand_frame_repr', False)

def expand_duplicates(df):
    expanded_rows = []
    for link in df['link'].unique():
        link_df = df[df['link'] == link]
        expanded_row = {'link': link}
        counters = {
            'school': 1,
            'degree': 1,
            'start date': 1,
            'end date': 1,
            'grade': 1,
            'description': 1
        }
        for _, row in link_df.iterrows():
            for col_type in counters.keys():
                col_name = f'education{counters[col_type]} {col_type}'
                expanded_row[col_name] = row[f'education1 {col_type}']
                counters[col_type] += 1
        expanded_rows.append(expanded_row)
    return pd.DataFrame(expanded_rows)

input_path = 'input.csv'
output_path = 'output.csv'
input_df = pd.read_csv(input_path, sep='\t')
expanded_df = expand_duplicates(input_df)
#print(expanded_df)
expanded_df.to_csv(output_path, index=False, sep='\t')
#5
Thanks for your time, but that is not what I need. As I said, I don't know how to program, but I see you added my column names and file names from the sample into the code itself. I need it to work regardless of the column names and with different file names (it actually already does; it has a parameter for that, "-f").
The problem is with the enumeration in the column names after expanding: I need it to group the original names and then add the next number to the whole group, not to each individual column.
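To make the rule concrete, here is a small illustration (not part of the script) of the names each duplicate row should produce; the field names come from the lists in my first post.

# Illustration only: the n-th duplicate row for a link should get one whole
# group of columns numbered n, not a separate running number per column.
fields = ["school", "degree", "start date", "end date", "grade", "description"]
for group_number in (1, 2, 3):
    print([f"education{group_number} {field}" for field in fields])
# education1 school ... education1 description
# education2 school ... education2 description
# education3 school ... education3 description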
#6
You can try it like this.
I think there must be a better way than this duplication, which makes it kind of messy.
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
import re

def expand_duplicates(df):
    expanded_rows = []
    # Split a column name into prefix, digits, and suffix, e.g. "education1 school".
    pattern = re.compile(r'^(.*?)(\d+)(.*?)$')
    for key_value in df.iloc[:, 0].unique():
        group = df[df.iloc[:, 0] == key_value]
        expanded_row = {df.columns[0]: key_value}
        # One number per duplicate row of this link, shared by the whole column group.
        group_counter = 1
        for _, row in group.iterrows():
            for col in df.columns[1:]:
                match = pattern.match(col)
                if match:
                    col_prefix, _, col_suffix = match.groups()
                    col_key = f"{col_prefix}{group_counter}{col_suffix}"
                    expanded_row[col_key] = row[col]
            group_counter += 1
        expanded_rows.append(expanded_row)
    return pd.DataFrame(expanded_rows)

input_path = 'input.csv'
output_path = 'output.csv'
input_df = pd.read_csv(input_path, sep='\t')
expanded_df = expand_duplicates(input_df)
#print(expanded_df)
expanded_df.to_csv(output_path, index=False, sep='\t')
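If it helps to see what the pattern actually captures, here is a small check using a column name from your input file; the rest of your original script, including the -f/-d/-s arguments, should be able to stay as it is and just use this expand_duplicates instead.

import re

pattern = re.compile(r'^(.*?)(\d+)(.*?)$')
# Non-greedy prefix, the digits, then the rest of the name.
print(pattern.match('education1 start date').groups())
# ('education', '1', ' start date')
# The captured digits are thrown away and replaced with group_counter, so every
# column coming from the second duplicate row becomes "education2 ...", the
# third "education3 ...", and so on.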