Python Forum

Hi everyone,
following the docx documentation and dropbox API documentation I wanted to download doc and docx files from a dropbox directory. I used this function which I adapted to return a pandas dataframe:

def download_file(dbx, path):
##    path = '/%s/%s/%s' % (folder, subfolder.replace(os.path.sep, '/'), name)
##    dest = '/%s/%s' % (folder, subfolder.replace(os.path.sep, '/'))
    while '//' in path:
        path = path.replace('//', '/')
##    while '//' in dest:
##        dest = dest.replace('//', '/')
    with stopwatch('download'):
        try:
            md,res = dbx.files_download(path)
        except dropbox.exceptions.HttpError as err:
            #print('*** HTTP error', err)
            return None
        except dropbox.exceptions.ApiError:
            #print('** API error', dropbox.exceptions.ApiError)
            return None
##    with open(res.content, encoding='CP1252') as data_handle:
##    data = res.content
##        s = str(data_handle)
##        s_data = StringIO(s)
    #decode unreadable 0x90 byte
    #with codecs.open(s_data, encoding='iso-8859-1') as s_data_handle:
    try:
        s_data = base64.b64decode(res.content)
    except:
        s_data = ""
    try:
        json_data = json.loads(s_data)
        json_str = json_data.decode('CP1252')
        dataframe = pd.DataFrame(json_str)
        print(dataframe)
    except:
        dataframe = pd.DataFrame()
    #print(len(data))
    return dataframe

You can see that I tried several decoding options because the files have a table and I am having difficulty parsing them and saving the doc files locally and passing them to a dataframe.
In my main() function I call the download_file as such, from a for loop reading directories:

                    res = download_file(dbx, fname)
                    doc = Document()
                    try:
                        t = doc.add_table(res.shape[0] + 1, res.shape[1])
                        #add header row
                        for j in range(res.shape[-1]):
                            t.cell(0, j).text = res.columns[j]
                        #populate rest of table
                        for i in range(res.shape[0]):
                            for j in range(res.shape[-1]):
                                t.cell(i + 1, j).text = str(res.values[i, j])
                                
                    except ValueError:
                        cleaned_str = ''.join(c for c in res.decode('CP1252', errors='ignore') if valid_xml_char_ordinal(c))
                        doc.add_paragraph(cleaned_str)
                        break

At this point I have downloaded the docs files, however they are unreadable by ms word. Is there a need to download the files locally and then reading using again docx tables method:

        doc1 = Document('./' + str(fname).replace('\\','/'))
        table = doc1.tables[0]
        data1 = []
        for i, row in enumerate(table.rows):
            text = (cell.text for cell in row.cells)
            #...

Is there a simpler solution? At this point my objective is to read doc/docx files from dropbox and saving the content of their tables in excel locally.
Thanks for your time

Got an example docx file to work on?

If the files are unreadable, nothing will help!

This gets the tables to pandas:

from docx import Document
import pandas as pd

mydocxfile = '/home/pedro/myPython/docxFiles/example_table2.docx'
for table in Document(mydocxfile).tables:
    data = [[cell.text for cell in row.cells] for row in table.rows]
    print(pd.DataFrame(data[1:], columns=data[0]), "\n")

Gives:

Output:           Name Age     Occupation
0  King Charles  75           King
1         Pedro  55       Layabout
2          Baby  32  Import-Export

iitip92

Pedroski55