Find strings by index from a list of indexes in a different Pandas dataframe column

Find strings by index from a list of indexes in a different Pandas dataframe column - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Data Science (https://python-forum.io/forum-44.html)
+--- Thread: Find strings by index from a list of indexes in a different Pandas dataframe column (/thread-42725.html)

Find strings by index from a list of indexes in a different Pandas dataframe column - Calab - Aug-23-2024

Consider the following Pandas dataframes...

df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],},
                           {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}])

df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"},
                           {"site_id": "2", "name": "Calgary"},
                           {"site_id": "3", "name": "Chicago"},
                           {"site_id": "4", "name": "New York"},
                           {"site_id": "5", "name": "Los Angeles"},])

How can I determine if "Calgary" is or is not in the area "One"?
How can I determine if "Calgary" is or is not in the area "Two"?
How can I determine that area "Two" contains a site id that does exist in df2?
How can I determine that "Boston" is not any of the sites?

I've been wrapping my brain around this for a few hours and I can't come up with an answers. :(

RE: Find strings by index from a list of indexes in a different Pandas dataframe column - Pedroski55 - Aug-25-2024

I am no expert, but I think this will do what you want.

import pandas as pd

df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],},
                           {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}])
 
df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"},
                           {"site_id": "2", "name": "Calgary"},
                           {"site_id": "3", "name": "Chicago"},
                           {"site_id": "4", "name": "New York"},
                           {"site_id": "5", "name": "Los Angeles"},])

# How can I determine if "Calgary" is or is not in the area "One"?
# get the site_id for any row whose name column contains Calgary
cond1 = (df2['name'] == 'Calgary')
res1 = df2[cond1].site_id.values.tolist()
cond2 = df1["id"] == "One"
res2 = df1[cond2].served_sites.values.tolist()
for v in res2[0]:
    if v in res1:
        print(f'{v} is in Area One and in df2')

#How can I determine if "Calgary" is or is not in the area "Two"?
# see above
#How can I determine that area "Two" contains a site id that does exist in df2?
cond3 = (df1['id'] == 'Two')
res3 = df1[cond3].served_sites.values.tolist()
for v in res3[0]:
    if v in df2["site_id"].values.tolist():
        print(f'{v} is in Area Two and in df2')
# How can I determine that "Boston" is not any of the sites?
cond4 = (df2['name'] == 'Boston')
res4 = df2[cond4].site_id.values.tolist()
# if True then Boston is not found in df2['name']
len(res4) == 0 # True

RE: Find strings by index from a list of indexes in a different Pandas dataframe column - deanhystad - Aug-26-2024

I don't think pandas is good for that kind of thing. I would convert to dictionaries.

import pandas as pd

df1 = pd.DataFrame(
    data=[
        {
            "id": "One",
            "served_sites": ["1", "2", "3"],
        },
        {
            "id": "Two",
            "served_sites": ["4", "5", "6"],
        },
    ]
)

df2 = pd.DataFrame(
    data=[
        {"site_id": "1", "name": "Toronto"},
        {"site_id": "2", "name": "Calgary"},
        {"site_id": "3", "name": "Chicago"},
        {"site_id": "4", "name": "New York"},
        {"site_id": "5", "name": "Los Angeles"},
    ]
)

site_names = dict(zip(df2.site_id, df2.name))
area_sites = dict(zip(df1.id, df1.served_sites))
for name, sites in area_sites.items():
    area_sites[name] = [site_names.get(site, "undefined") for site in sites]
print(area_sites)
print("Is Calgary in area One?", "Calgary" in area_sites["One"])
print("Is Calgary in area Two?", "Calgary" in area_sites["Two"])
print("All sites common to areas One and Two", set(area_sites["One"]) & set(area_sites["Two"]))

RE: Find strings by index from a list of indexes in a different Pandas dataframe column - Calab - Aug-26-2024

(Aug-23-2024, 06:34 PM)Calab Wrote: Consider the following Pandas dataframes...
df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],},
                           {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}])

df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"},
                           {"site_id": "2", "name": "Calgary"},
                           {"site_id": "3", "name": "Chicago"},
                           {"site_id": "4", "name": "New York"},
                           {"site_id": "5", "name": "Los Angeles"},])
How can I determine if "Calgary" is or is not in the area "One"?
How can I determine if "Calgary" is or is not in the area "Two"?
How can I determine that area "Two" contains a site id that does exist in df2?
How can I determine that "Boston" is not any of the sites?

This is what I eventually ended up with on my own:

import pandas as pd
 
df1 = pd.DataFrame(data = [{"id": "One", "served_sites": ["1", "2" ,"3"],},
                           {"id": "Two", "served_sites": ["4" ,"5" ,"6"],}])
  
df2 = pd.DataFrame(data = [{"site_id": "1", "name": "Toronto"},
                           {"site_id": "2", "name": "Calgary"},
                           {"site_id": "3", "name": "Chicago"},
                           {"site_id": "4", "name": "New York"},
                           {"site_id": "5", "name": "Los Angeles"},])

##### Is Calgary served by area One?

find_site = "Calgary"
find_area = "One"

site_idx = df2.query("name == '{}'".format(find_site))["site_id"].values[0]

served_sites = df1.loc[df1["id"] == find_area]["served_sites"].values[0]

if site_idx in served_sites:
    print("{} is served by area {}".format(find_site, find_area))
else:
    print("{} is NOT served by area {}".format(find_site, find_area))
    
##### Is Calgary served by area Two?

find_area = "Two"
    
site_idx = df2.query("name == '{}'".format(find_site))["site_id"].values[0]

served_sites = df1.loc[df1["id"] == find_area]["served_sites"].values[0]

if site_idx in served_sites:
    print("{} is served by area {}".format(find_site, find_area))
else:
    print("{} is NOT served by area {}".format(find_site, find_area))

##### Does area Two have any invalid sites?

verify_area = "Two"
verify_served_sites = df1.loc[df1["id"] == verify_area]["served_sites"].values[0]
for s_site in verify_served_sites:
    find_site = df2.loc[df2["site_id"] == s_site]["site_id"]
    if find_site.empty:
        print("Area {} has an invalid site id {}".format(verify_area, s_site))
        
##### Is Boston a site?
find_site = "Boston"
found_id = df2.query("name == '{}'".format(find_site))
if found_id.empty:
    print("{} is NOT a site".format(find_site))
else:
    print("{} is a site".format(find_site))