Python Forum

Full Version: Spreadsheet to dictionary
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
There are probably many ways to convert a spreadsheet to a python dictionary, here is another one. It uses Libreoffice's soffice command to convert the spreadsheet to xlsx format if necessary (in a temporary directory), then the openpyxl module to read it. Enjoy! Hand This code may work with .ods, .xlsx, .csv, .xls file formats and perhaps others.

EDIT: further development and latest code in this Gist

from collections import namedtuple
import pathlib
import pickle
import subprocess as sp
import tempfile
__version__ = '2019.01.16.1'
WorksheetKey = namedtuple('WorksheetKey', "index name")

def spreadsheet_to_dict(filename, cachename=None):
    """Return a dict: {(index, name): <list of tuples>}
    
    Arguments
        filename: a path to a .ods or .xlsx file. It may also work
                with .csv and .xls and others

        cachename: if not None, a path to a pickle file where the
                python dictionary will be stored. If this file exists
                and its modification time is more recent than that of
                the spreadsheet file, it is read directly instead of
                the spreadsheet.
                
    Worksheet keys are namedtuples with members .index and .name

    see also: libreoffice, openpyxl
    """
    filename = pathlib.Path(filename)
    if cachename:
        cachename = pathlib.Path(cachename)
        if cachename.is_file() and (
            cachename.stat().st_mtime > filename.stat().st_mtime):
            return load_cache(cachename)
    if filename.suffix == '.xlsx':
        db = {}
        from openpyxl.reader.excel import load_workbook
        wb=load_workbook(str(filename))
        for i, (ws, name) in enumerate(zip(wb.worksheets, wb.sheetnames)):
            db[WorksheetKey(i, name)] = list(ws.values)
        if cachename:
            dump_cache(db, cachename)
        return db        
    with tempfile.TemporaryDirectory() as tmpd:
        tmp = pathlib.Path(tmpd)
        userdir = tmp/'.user'
        userdir.mkdir()
        convert_to_xlsx = [
            'soffice',
            # avoid possible interaction with running instance of libreoffice
            '-env:UserInstallation={}'.format(userdir.as_uri()),
            '--headless',
            '--convert-to',
            # 'xlsx:"Calc MS Excel 2007 XML"', # <- doesn't work
            # 'xlsx', # <- works
            'xlsx:Calc MS Excel 2007 XML', # <- also works
            '--outdir',
            str(tmp),
            str(filename)]
        sp.run(convert_to_xlsx, check=True, stdout=sp.DEVNULL)
        return spreadsheet_to_dict(
            (tmp/filename.name).with_suffix('.xlsx'), cachename=cachename)
    
def load_cache(cachename):
    with cachename.open('rb') as ifh:
        return pickle.load(ifh)

def dump_cache(obj, cachename):
    try:
        exc_occurred = False
        try:
            with cachename.open('wb') as ofh:
                pickle.dump(obj, ofh)
        except Exception:
            exc_occurred = True
            raise
    finally:
        if exc_occurred:
            try:
                cachename.unlink()
            except OSError:
                pass


if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp:
        fn = pathlib.Path(tmp)/'tmp_example.csv'
        with fn.open('w') as ofh:
            print("""\
Spam,maps,32
Eggs,sgge,64
Bacon,nocab,128""", file=ofh)
        d = spreadsheet_to_dict(str(fn))
    print(d)
I upgraded the above post by adding a 'cachename' optional argument. It is useful if you run your program several times and need to load the same spreadsheet each time. Instead of calling libreoffice and openpyxl several times, the resulting dictionary can be stored in a pickle file and loaded directly from here. As long as the spreadsheet is not modified, the pickle file won't be rewritten. So you would use it this way
thedict = spreadsheet_to_dict('the-spreadsheet.ods', cachename='the-cache.pkl')
The conversion will occur only the first time the program is run.