Jan-31-2023, 04:30 AM
Better and faster than before. 20,000 record table in about 0.16 seconds. You can ignore the slice/copy warning.
import pandas as pd import numpy as np from time import time # Make some dummy dataframes # df1 has datetime, windspeed and pressure. Pressures are all 0/empty # df2 has datetime and pressure df1 = pd.DataFrame({ "Datetime": pd.date_range("1/1/2022", "1/1/2023", 20000), "WindSpeed": np.random.randint(2, size=20000), "Pressure": np.zeros(20000) }) df2 = pd.DataFrame({ "Datetime": pd.date_range("1/1/2022", "1/1/2023", 15000), "Pressure": np.random.random(15000) * 5 + 1, }) starting_time = time() # We are only interested in wind events. No reason to look at other rows. # We can roll changes back into df1 later on using the row index values. wind_events = df1[df1["WindSpeed"] > 0] # Extract the time and pressure info. Faster to work with these # as lists instead of getting from dataframes each time. a_times = wind_events["Datetime"].values b_times = df2["Datetime"].values a_pressure = wind_events["Pressure"].values b_pressure = df2["Pressure"].values # For each wind event, find the pressure from df2 that has the # closest timestamp. Only accept pressues within time window. window = np.timedelta64(10, "m") b = 0 for a, a_time in enumerate(a_times): best_diff = prev_diff = abs(b_times[b] - a_time) best_index = b for b in range(b+1, len(b_times)): diff = abs(b_times[b] - a_time) if diff < best_diff: best_diff, best_index = diff, b if diff > prev_diff: # We are not going to get a better match break prev_diff = diff if best_index is not None and best_diff <= window: a_pressure[a] = b_pressure[best_index] # Replace wind_events["Pressure"] with a_pressure list and fold # wind_events back into df1. wind_events["Pressure"] = a_pressure df1.update(wind_events["Pressure"]) print(time() - starting_time)