Jun-04-2023, 03:56 PM
Do you do something to create
Ohh 500 columns is hell of DataFrame,not gone look for error in that.
merged_df_nan
i mean for me it dos not work if i just read sample.xlsx
?Ohh 500 columns is hell of DataFrame,not gone look for error in that.
import pandas as pd from tqdm import tqdm def average_col(merged_df_nan): #Filling NaN values for columns with less than 20k NaNs with the average of the period before and after na_counts = merged_df_nan.isna().sum() columns_to_fill2 = na_counts[(na_counts < 20000) & (~na_counts.index.isin(['Text', 'gvkey', 'datadate', 'prccq', 'naics', 'sic', 'processed_text', 'stemmed_text', 'sentiment_score', 'fyr', 'indfmt','consol','popsrc','tic','conm','apdedateq','fdateq','rdq']))].index #Iterate over the filtered columns and fill NA values with the average of previous and next non-NA values for column in tqdm(columns_to_fill2, desc = 'Columns done'): na_mask = merged_df_nan[column].isna() for index, row in merged_df_nan.iterrows(): if na_mask[index]: fyearq = row['fyearq'] fqtr = row['fqtr'] cik = row['cik'] #Looking for the values and taking the average of them prev_value = merged_df_nan.loc[(merged_df_nan['cik'] == cik) & ((merged_df_nan['fyearq'] < fyearq) | ((merged_df_nan['fyearq'] == fyearq) & (merged_df_nan['fqtr'] < fqtr)))].sort_values(['fyearq', 'fqtr'], ascending=[False, False]).head(1)[column].item() next_value = merged_df_nan.loc[(merged_df_nan['cik'] == cik) & ((merged_df_nan['fyearq'] > fyearq) | ((merged_df_nan['fyearq'] == fyearq) & (merged_df_nan['fqtr'] > fqtr)))].sort_values(['fyearq', 'fqtr'], ascending=[True, True]).head(1)[column].item() average_value = np.mean([prev_value, next_value]) merged_df_nan.loc[index, column] = average_value return merged_df_nan if __name__ == '__main__': merged_df_nan = pd.read_excel('sample.xlsx') res = average_col(merged_df_nan)
Error:Traceback (most recent call last):
File "G:\div_code\hex\game\av.py", line 28, in <module>
res = average_col(merged_df_nan)
File "G:\div_code\hex\game\av.py", line 20, in average_col
prev_value = merged_df_nan.loc[(merged_df_nan['cik'] == cik) & ((merged_df_nan['fyearq'] < fyearq) | ((merged_df_nan['fyearq'] == fyearq) & (merged_df_nan['fqtr'] < fqtr)))].sort_values(['fyearq', 'fqtr'], ascending=[False, False]).head(1)[column].item()
File "C:\Python310\lib\site-packages\pandas\core\base.py", line 349, in item
raise ValueError("can only convert an array of size 1 to a Python scalar")
ValueError: can only convert an array of size 1 to a Python scalar