Python Forum

script works in windows but not in linux
Hello everybody. I work on a project where we need to transfer a huge amount of data into ArangoDB. The data to be transferred come as xlsx files with multiple worksheets per file. I wrote a script that converts the xlsx files into JSON files and then connects to the database and imports the data. When the script is run on Windows it works like a charm: no warnings, no errors, nothing. After I finished the coding I moved the script to my virtual server running CentOS 7 and installed the latest version of ArangoDB. Every time I tried to run the script it crashed, giving "Segmentation fault (core dumped)" when I ran it from the command line and "Kernel restarting. The kernel appears to have died. It will restart automatically" when I ran it in Jupyter Notebook. I also tested the script on a physical machine running Ubuntu 19.10 and got exactly the same result. All machines were running Anaconda3 2020 and were fully updated. For the coding I used Jupyter Notebook and Atom. My code is this:
from zipfile import ZipFile
from bs4 import BeautifulSoup
import pandas as pd
from xlsx2csv import Xlsx2csv as x2csv
import os
import hashlib
import json
import numpy as np
from arango import ArangoClient
import glob

filelist = []
hash_dict = {}
current_folder = os.getcwd()
for file in os.listdir(current_folder):
    if file.endswith(".xlsx"):
        filelist.append(file)

#create a list of all worksheets contained in the workbook
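#an xlsx file is a zip archive; xl/workbook.xml lists each sheet as a <sheet name="..."> element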
def create_sheet_list(file):
    with ZipFile(file) as zipped_file:
        summary = zipped_file.open(r'xl/workbook.xml').read()
    soup = BeautifulSoup(summary, "xml")
    sheets = [sheet.get("name") for sheet in soup.find_all("sheet")]
    return sheets

#create an array of dataframes from all the worksheets
def create_dataframes(file):
    xl = pd.ExcelFile(file)
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
    return dfs

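#convert every worksheet of the file into a .json file inside the file's folder and record a hash for each one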
def create_json(file,sheets,dfs):
    print(("The file contains {} sheets").format(len(sheets)))
    count = 0
    for i in sheets:
        json_filelist = []
        count = count + 1
#produce the dataframe and check if there are any encoding errors
        try:
            df = dfs[i]
            new_header = df.iloc[0]
            df = df[1:]
            df.columns = new_header
            df = df.fillna(0)
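#derive a stable 10-digit id from the sheet's json file name and store it in a 'Hash' column on every row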
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
#For the dataframes that will get an error because of encoding a different way of conversion will be used
        except UnicodeEncodeError:
            x2csv(file, outputencoding="utf-8").convert(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')),count)
            df = pd.read_csv(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), header = 1)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
            os.remove(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')))

#Create connection with the Database
def create_db_connection():
    client = ArangoClient(hosts='http://127.0.0.1:8529')
    db = client.db('CEM', username='root', password='123456')
    return db

#Get the list of the .json files from all the folders
def list_of_json():
    path =  os.getcwd()
    folders = os.listdir(path)
    json_names = []
    for folder in folders:
        files = glob.glob(path+"/"+folder+"/"+"*.json")
        if len(files)>0:
            json_names.append(files)
    return json_names

#Get the list of the collections in the database
def list_of_collections(sheets,db):
    for col in sheets:
        col = col.replace(' ','_')
        if db.has_collection(col):
            collect = db.collection(col)
        else:
            collect = db.create_collection(col)
    collection = db.collections()
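#keep only the collections whose names start with an uppercase letter, which filters out ArangoDB's system collections (their names start with an underscore)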
    collection = [i['name'] for i in collection if i['name'][0].isupper()]
    return collection

#Import the data from the .json files to the appropriate collections
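#a json file is matched to a collection when its name ends with '<CollectionName>.json'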
def import_data(json_names,collection, db):
    for x in json_names:
        for y in x:
            for z in collection:
                with open(y, "r") as json_file:
                    if y.endswith("{}.json".format(z)):
                        data = json.load(json_file)
                        z = db.collection(z)
                        z.import_bulk(data)

for file in filelist:
    try:
#create the folder where the .json files from that UFED will be stored
        new_folder = os.mkdir(os.getcwd()+"/"+file.strip('.xlsx'))
#get the path for the new folder
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    except FileExistsError:
#if the folder already exists just get its path
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    print(new_path)
#print the name of the file that's being analyzed so that we have a measure of progress
    print(("Now I am working with {} file").format(file))
#call the functions and run the program
    sheets = create_sheet_list(file)
    dfs = create_dataframes(file)
    create_json(file,sheets,dfs)
df_dict = pd.DataFrame(list(hash_dict.items()), index = None, columns = ["File_name", "Hash_num"])
df_dict.to_json(current_folder+"/hash_list.json", orient = "records")
db = create_db_connection()
#create_collections(sheets,db)
json_names = list_of_json()
collection = list_of_collections(sheets,db)
import_data(json_names,collection,db)
One last thing to mention: most of the .xlsx files are pretty big (10-25 MB) and contain at least 25 worksheets, many of them with hundreds or thousands of rows.
The Windows machine is a laptop with an i7-7700 and 16 GB of RAM, the CentOS 7 server is a VM with 4 processors and 32 GB of RAM, and the Ubuntu 19.10 machine is a laptop with an i7-9750H, 16 GB of RAM, and an NVMe drive holding both the boot and swap partitions (16 GB of swap).
Can anyone help?
Which Python version do you run it with? On both Windows and Linux (CentOS and Ubuntu)?
(Apr-28-2020, 05:56 AM)buran Wrote: Which Python version do you run it with? On both Windows and Linux (CentOS and Ubuntu)?

The Python version used is Python 3.7.4 on both Linux distros and on Windows. On all three systems everything concerning Python is the same: all three have Anaconda3 2020, the latest edition from their site, and both Linux distros were fully updated before the installation of Anaconda and after.
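One way to narrow down which call actually segfaults on the Linux machines is Python's built-in faulthandler module, which prints the Python stack that was executing when the interpreter receives a fatal signal such as SIGSEGV. A minimal sketch, assuming the script is saved as a plain .py file (the name script.py below is only a placeholder) and started from the shell:

# put these two lines at the very top of the script, before the other imports,
# so the handler is installed before any native extension gets loaded
import faulthandler
faulthandler.enable()

The same effect can be had without editing the file by running python -X faulthandler script.py or by setting PYTHONFAULTHANDLER=1 in the environment; the traceback printed at the crash should show which library's native code (pandas, bs4, python-arango, ...) is dying.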