Posts: 61
Threads: 26
Joined: Feb 2023
I have a rather large, complicated dictionary structure that I need to duplicate in a Snowflake SQL database. I know that I can iterate through each dictionary and INSERT the data one row at a time, but that will be a lot of work and won't be very efficient.
I would like to know if there is a way that I could use my dictionary structure to create tables and relate them to each other.
Sample data:
{"provinces": {"province":"Ontario",
"cities": [
{"city":"Toronto",
"population": 2930000,
"districts": [
{"district": "East York"},
{"district": "North York"},
{"district": "Scarborough"},
],
},
{"city":"Ottawa",
"population": 994837,
"districts": [
{"district": "Centretown"},
{"district": "Golden Triangle"},
{"district": "Sandy Hill"},
],
},
],
"lakes": [
{"lake": "Lake Erie"},
{"lake": "Lake Simcoe"},
],
},
{"province": "Alberta",
"cities": [
{"city":"Edmonton",
"population":981280,
"districts": [
{"district": "Ellerslie"},
{"district": "Whitemud"},
],
},
"lakes": [
{"lake": "Lake Louise"},
{"lake": "Moraine Lake"},
{"lake": "Bow Lake"},
],
},
{"province": "Saskatchewan",
"cities": [
{"city": "Saskatoon",
"population": 273010,
"districts": [
{"district": "Holmwood"},
{"district": "Lakewood"},
],
},
],
}
} The desired SQL tables that I would want to generate from that data:
CREATE TABLE provinces (prov_pk INT AUTOINCREMENT,
province STRING,
note STRING,
PRIMARY KEY (prov_pk)
)
CREATE TABLE cities (city_pk INT AUTOINCREMENT,
prov_key INT NOT NULL,
city STRING NOT NULL,
capital STRING NOT NULL,
population INT NOT NULL,
PRIMARY KEY (city_pk),
FOREIGN KEY (prov_key) REFERENCES provinces(prov_pk)
)
CREATE TABLE districts (district_key INT AUTOINCREMENT,
city_key INT,
district STRING,
PRIMARY KEY (district_key),
FOREIGN KEY (city_key) REFERENCES cities(city_pk)
)
CREATE TABLE lakes (lake_key INT AUTOINCREMENT,
prov_key INT NOT NULL,
lake STRING NOT NULL,
PRIMARY KEY (lake_key),
FOREIGN KEY (prov_key) REFERENCES provinces(prov_pk) Is there a module or something else that could handle this?
Posts: 4,780
Threads: 76
Joined: Jan 2018
Sep-26-2024, 08:24 AM
(This post was last modified: Sep-26-2024, 08:24 AM by Gribouillis.)
Please post syntactically correct code. I had to fix the dictionary!
You could start by flattening the structure, here is code that does that
data = {
"provinces": [
{
"province": "Ontario",
"cities": [
{
"city": "Toronto",
"population": 2930000,
"districts": [
{"district": "East York"},
{"district": "North York"},
{"district": "Scarborough"},
],
},
{
"city": "Ottawa",
"population": 994837,
"districts": [
{"district": "Centretown"},
{"district": "Golden Triangle"},
{"district": "Sandy Hill"},
],
},
],
"lakes": [
{"lake": "Lake Erie"},
{"lake": "Lake Simcoe"},
],
},
{
"province": "Alberta",
"cities": [
{
"city": "Edmonton",
"population": 981280,
"districts": [
{"district": "Ellerslie"},
{"district": "Whitemud"},
],
}
],
"lakes": [
{"lake": "Lake Louise"},
{"lake": "Moraine Lake"},
{"lake": "Bow Lake"},
],
},
{
"province": "Saskatchewan",
"cities": [
{
"city": "Saskatoon",
"population": 273010,
"districts": [
{"district": "Holmwood"},
{"district": "Lakewood"},
],
},
],
},
]
}
class Parser:
def __init__(self):
self.rows = []
def handle_dict(self, dic, parent_num):
num = len(self.rows)
self.rows.append(obj := {})
for key, value in dic.items():
if isinstance(value, list):
for item in value:
self.handle_dict(item, num)
else:
obj[key] = value
obj['parent'] = parent_num
def main():
p = Parser()
p.handle_dict(data, -1)
for i, x in enumerate(p.rows):
print(i, x)
if __name__ == "__main__":
main() Output: 0 {'parent': -1}
1 {'province': 'Ontario', 'parent': 0}
2 {'city': 'Toronto', 'population': 2930000, 'parent': 1}
3 {'district': 'East York', 'parent': 2}
4 {'district': 'North York', 'parent': 2}
5 {'district': 'Scarborough', 'parent': 2}
6 {'city': 'Ottawa', 'population': 994837, 'parent': 1}
7 {'district': 'Centretown', 'parent': 6}
8 {'district': 'Golden Triangle', 'parent': 6}
9 {'district': 'Sandy Hill', 'parent': 6}
10 {'lake': 'Lake Erie', 'parent': 1}
11 {'lake': 'Lake Simcoe', 'parent': 1}
12 {'province': 'Alberta', 'parent': 0}
13 {'city': 'Edmonton', 'population': 981280, 'parent': 12}
14 {'district': 'Ellerslie', 'parent': 13}
15 {'district': 'Whitemud', 'parent': 13}
16 {'lake': 'Lake Louise', 'parent': 12}
17 {'lake': 'Moraine Lake', 'parent': 12}
18 {'lake': 'Bow Lake', 'parent': 12}
19 {'province': 'Saskatchewan', 'parent': 0}
20 {'city': 'Saskatoon', 'population': 273010, 'parent': 19}
21 {'district': 'Holmwood', 'parent': 20}
22 {'district': 'Lakewood', 'parent': 20}
« We can solve any problem by introducing an extra level of indirection »
Posts: 4,780
Threads: 76
Joined: Jan 2018
I added more code to plan the building of the tables
class Parser:
def __init__(self):
self.rows = []
def handle_dict(self, dic, parent_num):
num = len(self.rows)
self.rows.append(obj := {})
for key, value in dic.items():
if isinstance(value, list):
for item in value:
self.handle_dict(item, num)
else:
obj[key] = value
obj['parent'] = parent_num
def plan_tables(self):
pos = [None]
self.tables = {}
for i, obj in enumerate(self.rows[1:], 1):
name = next(iter(obj))
if not name in self.tables:
self.tables[name] = []
t = self.tables[name]
j = len(t)
pos.append((name, j))
t.append(obj)
if obj['parent']:
obj['parent'] = pos[obj['parent']]
else:
del obj['parent']
def main():
p = Parser()
p.handle_dict(data, -1)
for i, x in enumerate(p.rows):
print(i, x)
print('==========================')
p.plan_tables()
for name, table in p.tables.items():
print(f'TABLE {name}:')
for x in table:
print(x)
if __name__ == "__main__":
main() Output: ==========================
TABLE province:
{'province': 'Ontario'}
{'province': 'Alberta'}
{'province': 'Saskatchewan'}
TABLE city:
{'city': 'Toronto', 'population': 2930000, 'parent': ('province', 0)}
{'city': 'Ottawa', 'population': 994837, 'parent': ('province', 0)}
{'city': 'Edmonton', 'population': 981280, 'parent': ('province', 1)}
{'city': 'Saskatoon', 'population': 273010, 'parent': ('province', 2)}
TABLE district:
{'district': 'East York', 'parent': ('city', 0)}
{'district': 'North York', 'parent': ('city', 0)}
{'district': 'Scarborough', 'parent': ('city', 0)}
{'district': 'Centretown', 'parent': ('city', 1)}
{'district': 'Golden Triangle', 'parent': ('city', 1)}
{'district': 'Sandy Hill', 'parent': ('city', 1)}
{'district': 'Ellerslie', 'parent': ('city', 2)}
{'district': 'Whitemud', 'parent': ('city', 2)}
{'district': 'Holmwood', 'parent': ('city', 3)}
{'district': 'Lakewood', 'parent': ('city', 3)}
TABLE lake:
{'lake': 'Lake Erie', 'parent': ('province', 0)}
{'lake': 'Lake Simcoe', 'parent': ('province', 0)}
{'lake': 'Lake Louise', 'parent': ('province', 1)}
{'lake': 'Moraine Lake', 'parent': ('province', 1)}
{'lake': 'Bow Lake', 'parent': ('province', 1)}
Pedroski55 and Calab like this post
« We can solve any problem by introducing an extra level of indirection »
Posts: 1,089
Threads: 143
Joined: Jul 2017
Oct-05-2024, 04:27 AM
(This post was last modified: Oct-05-2024, 04:27 AM by Pedroski55.)
I like Gribouillis' solution, but I have never made any classes, so I don't really understand them.
The first problem was to correct the bad dictionary you posted. That took me a while!
Thereafter, you just have: dictionary[key]: list . No module or extra stuff is needed to deal with the data.
I use MySQL and phpadmin to access the database. I don't know anything about Snowflake SQL. Must be cold Canadian!
For doing things with or to a database from Python, I use pymysql.
import pymysql
# this data is ok, finally!
mydict = {"provinces": [
{"province": "Ontario",
"cities": [{"city":"Toronto",
"population": 2930000,
"districts": [{"district": "East York"}, {"district": "North York"}, {"district": "Scarborough"}]},
{"city":"Ottawa",
"population": 994837,
"districts": [{"district": "Centretown"}, {"district": "Golden Triangle"}, {"district": "Sandy Hill"}]}],
"lakes": [{"lake": "Lake Erie"}, {"lake": "Lake Simcoe"}]},
{"province": "Alberta",
"cities": [{"city":"Edmonton",
"population":981280,
"districts": [{"district": "Ellerslie"}, {"district": "Whitemud"}]}],
"lakes": [{"lake": "Lake Louise"}, {"lake": "Moraine Lake"}, {"lake": "Bow Lake"}]},
{"province": "Saskatchewan",
"cities": [{"city": "Saskatoon",
"population": 273010,
"districts": [{"district": "Holmwood"}, {"district": "Lakewood"}]}],
"lakes": [{"lake": "Lake Skat"}, {"lake": "Lake Chewan"}]}]}
# how to get all the data you need from mydict
print(type(mydict['provinces']) # <class 'list'>
for p in mydict['provinces']:
print(f"**********{p['province']}**********")
for c in p['cities']:
print(f"++++++{c['city']}++++++")
print(c['population'])
for d in c['districts']:
print(f"----{d['district']}----")
for l in p['lakes']:
print(f"____{l['lake']}____")
# first you need MySQL installed and running with a privileged user
# connect to MySQL and create a new db
connect_db = pymysql.connect(host='localhost',
user='pedro',
password='topsecret')
# make a database called canada
with connect_db.cursor() as cursor:
cursor.execute('CREATE DATABASE IF NOT EXISTS canada')
connect_db.close()
# for connecting to the db canada, close after using
connect_canada_db = pymysql.connect(host='localhost',
user='pedro',
#port=''
password='topsecret',
db='canada',
cursorclass=pymysql.cursors.DictCursor)
# now make the tables
# first provinces
# only have 3 provinces, number keys: Ontario = 1, Alberta = 2, Saskatchewan = 3 need to pass these when inserting data
# the other provinces have declared independence
with connect_canada_db.cursor() as cursor:
make_table = """CREATE TABLE IF NOT EXISTS provinces (prov_pk INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
province VARCHAR(50),
note VARCHAR(50)) DEFAULT CHARACTER SET UTF8MB4 ENGINE=InnoDB;"""
cursor.execute(make_table)
auto_inc = 'ALTER TABLE provinces AUTO_INCREMENT=1;'
cursor.execute(auto_inc)
make_table = """CREATE TABLE IF NOT EXISTS cities (city_pk INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
prov_key INT NOT NULL,
city VARCHAR(50) NOT NULL DEFAULT 'Gotham',
capital VARCHAR(50) NOT NULL DEFAULT 'Gotham',
population INT NOT NULL DEFAULT 0,
FOREIGN KEY (prov_key) REFERENCES provinces(prov_pk)) DEFAULT CHARACTER SET UTF8MB4 ENGINE=InnoDB;"""
cursor.execute(make_table)
auto_inc = 'ALTER TABLE cities AUTO_INCREMENT=1;'
cursor.execute(auto_inc)
make_table = """CREATE TABLE districts (district_key INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
city_key INT NOT NULL,
district VARCHAR(50),
FOREIGN KEY (city_key) REFERENCES cities(city_pk)) DEFAULT CHARACTER SET UTF8MB4 ENGINE=InnoDB;"""
cursor.execute(make_table)
auto_inc = 'ALTER TABLE districts AUTO_INCREMENT=1;'
cursor.execute(auto_inc)
make_table = """CREATE TABLE lakes (lake_key INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
prov_key INT NOT NULL,
lake VARCHAR(50) NOT NULL,
FOREIGN KEY (prov_key) REFERENCES provinces(prov_pk)) DEFAULT CHARACTER SET UTF8MB4 ENGINE=InnoDB;"""
cursor.execute(make_table)
auto_inc = 'ALTER TABLE lakes AUTO_INCREMENT=1;'
cursor.execute(auto_inc)
# now populate the tables
# first provinces
with connect_canada_db.cursor() as cursor:
for p in mydict['provinces']:
print(p['province']) # shows the 3 remaining provinces of once proud Canada
if p['province'] == 'Ontario':
remark = 'backwards'
elif p['province'] == 'Alberta':
remark = 'Let your hair hang down'
elif p['province'] == 'Saskatchewan':
remark = 'Freezing'
tup = (p['province'], remark)
insert_sql = "INSERT INTO provinces (province, note) VALUES (%s, %s)"
cursor.execute(insert_sql, tup)
connect_canada_db.commit()
# second cities
with connect_canada_db.cursor() as cursor:
for p in mydict['provinces']:
print(p['province']) # shows the 3 remaining provinces of once proud Canada
if p['province'] == 'Ontario':
fk = 1
elif p['province'] == 'Alberta':
fk = 2
elif p['province'] == 'Saskatchewan':
fk = 3
for c in p['cities']:
print(c['city'])
tup = (fk, c['city'], c['population'] )
insert_sql = "INSERT INTO cities (prov_key, city, population) VALUES (%s, %s, %s)"
cursor.execute(insert_sql, tup)
connect_canada_db.commit()
# third districts
with connect_canada_db.cursor() as cursor:
for p in mydict['provinces']:
print(p['province']) # shows the 3 remaining provinces of once proud Canada
for c in p['cities']:
print(c['city'])
if c['city'] == 'Toronto':
fk = 1
elif c['city'] == 'Ottawa':
fk = 2
elif c['city'] == 'Edmonton':
fk = 3
elif c['city'] == 'Saskatoon':
fk = 4
for d in c['districts']:
print(d['district'])
tup = (fk, d['district'])
insert_sql = "INSERT INTO districts (city_key, district) VALUES (%s, %s)"
cursor.execute(insert_sql, tup)
connect_canada_db.commit()
# fourth lakes
with connect_canada_db.cursor() as cursor:
for p in mydict['provinces']:
print(p['province']) # shows the 3 remaining provinces of once proud Canada
if p['province'] == 'Ontario':
fk = 1
elif p['province'] == 'Alberta':
fk = 2
elif p['province'] == 'Saskatchewan':
fk = 3
for l in p['lakes']:
print(l['lake'])
tup = (fk, l['lake'])
insert_sql = "INSERT INTO lakes (prov_key, lake) VALUES (%s, %s)"
cursor.execute(insert_sql, tup)
connect_canada_db.commit()
# close the connection
connect_canada_db.close()
print("All done, everything looks good in phpmyadmin!") Personally, I would use the names of the provinces as the FOREIGN KEY, otherwise you need to remember which number belongs to which province.
I find, the easiest way to populate a table is import a csv using phpadmin.
|