![]() |
Find strings by index from a list of indexes in a different Pandas dataframe column - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Data Science (https://python-forum.io/forum-44.html) +--- Thread: Find strings by index from a list of indexes in a different Pandas dataframe column (/thread-42725.html) |
Find strings by index from a list of indexes in a different Pandas dataframe column - Calab - Aug-23-2024 Consider the following Pandas dataframes... df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],}, {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}]) df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"}, {"site_id": "2", "name": "Calgary"}, {"site_id": "3", "name": "Chicago"}, {"site_id": "4", "name": "New York"}, {"site_id": "5", "name": "Los Angeles"},])How can I determine if "Calgary" is or is not in the area "One"? How can I determine if "Calgary" is or is not in the area "Two"? How can I determine that area "Two" contains a site id that does exist in df2? How can I determine that "Boston" is not any of the sites? I've been wrapping my brain around this for a few hours and I can't come up with an answers. :( RE: Find strings by index from a list of indexes in a different Pandas dataframe column - Pedroski55 - Aug-25-2024 I am no expert, but I think this will do what you want. import pandas as pd df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],}, {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}]) df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"}, {"site_id": "2", "name": "Calgary"}, {"site_id": "3", "name": "Chicago"}, {"site_id": "4", "name": "New York"}, {"site_id": "5", "name": "Los Angeles"},]) # How can I determine if "Calgary" is or is not in the area "One"? # get the site_id for any row whose name column contains Calgary cond1 = (df2['name'] == 'Calgary') res1 = df2[cond1].site_id.values.tolist() cond2 = df1["id"] == "One" res2 = df1[cond2].served_sites.values.tolist() for v in res2[0]: if v in res1: print(f'{v} is in Area One and in df2') #How can I determine if "Calgary" is or is not in the area "Two"? # see above #How can I determine that area "Two" contains a site id that does exist in df2? cond3 = (df1['id'] == 'Two') res3 = df1[cond3].served_sites.values.tolist() for v in res3[0]: if v in df2["site_id"].values.tolist(): print(f'{v} is in Area Two and in df2') # How can I determine that "Boston" is not any of the sites? cond4 = (df2['name'] == 'Boston') res4 = df2[cond4].site_id.values.tolist() # if True then Boston is not found in df2['name'] len(res4) == 0 # True RE: Find strings by index from a list of indexes in a different Pandas dataframe column - deanhystad - Aug-26-2024 I don't think pandas is good for that kind of thing. I would convert to dictionaries. import pandas as pd df1 = pd.DataFrame( data=[ { "id": "One", "served_sites": ["1", "2", "3"], }, { "id": "Two", "served_sites": ["4", "5", "6"], }, ] ) df2 = pd.DataFrame( data=[ {"site_id": "1", "name": "Toronto"}, {"site_id": "2", "name": "Calgary"}, {"site_id": "3", "name": "Chicago"}, {"site_id": "4", "name": "New York"}, {"site_id": "5", "name": "Los Angeles"}, ] ) site_names = dict(zip(df2.site_id, df2.name)) area_sites = dict(zip(df1.id, df1.served_sites)) for name, sites in area_sites.items(): area_sites[name] = [site_names.get(site, "undefined") for site in sites] print(area_sites) print("Is Calgary in area One?", "Calgary" in area_sites["One"]) print("Is Calgary in area Two?", "Calgary" in area_sites["Two"]) print("All sites common to areas One and Two", set(area_sites["One"]) & set(area_sites["Two"])) RE: Find strings by index from a list of indexes in a different Pandas dataframe column - Calab - Aug-26-2024 (Aug-23-2024, 06:34 PM)Calab Wrote: Consider the following Pandas dataframes... This is what I eventually ended up with on my own: import pandas as pd df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],}, {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}]) df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"}, {"site_id": "2", "name": "Calgary"}, {"site_id": "3", "name": "Chicago"}, {"site_id": "4", "name": "New York"}, {"site_id": "5", "name": "Los Angeles"},]) ##### Is Calgary served by area One? find_site = "Calgary" find_area = "One" site_idx = df2.query("name == '{}'".format(find_site))["site_id"].values[0] served_sites = df1.loc[df1["id"] == find_area]["served_sites"].values[0] if site_idx in served_sites: print("{} is served by area {}".format(find_site, find_area)) else: print("{} is NOT served by area {}".format(find_site, find_area)) ##### Is Calgary served by area Two? find_area = "Two" site_idx = df2.query("name == '{}'".format(find_site))["site_id"].values[0] served_sites = df1.loc[df1["id"] == find_area]["served_sites"].values[0] if site_idx in served_sites: print("{} is served by area {}".format(find_site, find_area)) else: print("{} is NOT served by area {}".format(find_site, find_area)) ##### Does area Two have any invalid sites? verify_area = "Two" verify_served_sites = df1.loc[df1["id"] == verify_area]["served_sites"].values[0] for s_site in verify_served_sites: find_site = df2.loc[df2["site_id"] == s_site]["site_id"] if find_site.empty: print("Area {} has an invalid site id {}".format(verify_area, s_site)) ##### Is Boston a site? find_site = "Boston" found_id = df2.query("name == '{}'".format(find_site)) if found_id.empty: print("{} is NOT a site".format(find_site)) else: print("{} is a site".format(find_site)) |