Python Forum

Full Version: Invalid Date Format fo Cached Files
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
In this python code I am attempting to extract files from a FTP folder and then search through them for an invoice#. I have the command window open when running and keep getting stuck on the part where the code is trying to strip the date out of the file name, I get the error "Invalid Date Format" and it states it is skipping that file. The folder contains a ton of files and I really only need to look at the last 30 days worth of files. We have another python code that is doing something similar but with a different file name layout and I used that code to try to replicate it in this instance. This is what the files look like on the ftp folder, attached. This is the beginning of the code that I am getting stuck on will see about the search part once I figure this out;
import os
import datetime
from ftplib import FTP
import csv
from tkinter import *
from tkinter import messagebox
import tkinter as tk

# FTP and path configurations
outputPath = r'\\xxxxxx\xxxxxxxxxx\SanMar Invoice'
cacheDir = 'C:/temp/SanMarRoi/cache'
cacheSize = 60
ftp_site = "xxxxxxxxx"
ftp_username = "xxxxxxxx"
ftp_password = "xxxxxxxx"

# Ensure directories exist
if not os.path.exists(cacheDir):
    os.makedirs(cacheDir)
if not os.path.exists(outputPath):
    os.makedirs(outputPath)

rows = []
invoice = set()
ponumber = ""

# Function to fetch files from FTP and sync cache
def sync_cache():
    ftp = FTP(ftp_site)
    ftp.login(user=ftp_username, passwd=ftp_password)
    ftp.cwd("Outbound")

    filenames = []
    ftp.retrlines("LIST", lambda line: filenames.append(line.split()[-1]))
    
    print("Files retrieved from FTP server:")
    for file in filenames:
        print(file)

    # Get current date
    now = datetime.datetime.now()
    
    valid_filenames = []
    for file in filenames:
        try:
            # Extract the date from the filename
            file_date_str = file.split('-')[-1].split('.')[0]  # Get the last part of the filename and remove extension
            file_date = datetime.datetime.strptime(file_date_str, "%m-%d-%y")
            valid_filenames.append((file, file_date))
        except (ValueError, IndexError):
            # If parsing fails, skip the file
            print(f"Skipping file {file}: Invalid date format")

    # Filter to include only files from the last 30 days
    recent_files = [file for file in valid_filenames if (now - file[1]).days <= 30]
    
    print("Recent files from the last 30 days:")
    for file in recent_files:
        print(file[0])

    recent_files.sort(key=lambda filename: filename[1], reverse=True)

    print("Syncing cache. Please wait...")
    for i, (filename, _) in enumerate(recent_files):
        if i == cacheSize:
            break
        local_path = os.path.join(cacheDir, filename)
        if os.path.exists(local_path):
            print(f"File already in cache: {filename}")
            continue
        with open(local_path, "w") as cacheFile:
            ftp.retrbinary(f"RETR {filename}", lambda data: cacheFile.write(data.decode("utf-8")))
        print(f"Downloaded and cached file: {filename}")
    print("Cache sync complete")
    return [file[0] for file in recent_files]
"64727_Inventory_Details-05-02-24.txt". split("-") returns ["64727_Inventory_Details", "05", "02", "24.txt"]. Probably not what you expect, but easily learned if you think about it.

I would start solving this problem like this:
filename = "64727_Inventory_Details-05-02-24.txt"
print(filename.split("-"))
Output:
['64727_Inventory_Details', '05', '02', '24.txt']
This little program demonstrates why your approach will not work.

You could just do the first split.
filename = "64727_Inventory_Details-05-02-24.txt"
print(filename.split("-", maxsplit=1))
Output:
['64727_Inventory_Details', '05-02-24.txt']
Your original code should have been written like this:
from datetime import datetime

filenames = (
    "64727_Inventory_Details-05-02-24.txt",
    "64728_Inventory_Details-05-04-24.txt",
    "64728_Inventory_Details.txt",
)

valid_files = []
for name in filenames:
    try:
        datestr = name.split("-", maxsplit=1)[1].split(".")[0]
        valid_files.append((datetime.strptime(datestr, "%m-%d-%y").date(), name))
    except (ValueError, IndexError):
        pass
print(*valid_files, sep="\n")
Output:
(datetime.date(2024, 5, 2), '64727_Inventory_Details-05-02-24.txt') (datetime.date(2024, 5, 4), '64728_Inventory_Details-05-04-24.txt')
Notice how my example includes sample filenames. If you want help, make it easy for others to help you. Don't make them transcribe filenames from a fuzzy screenshot.

This approach works, but I think I'd try a different approach using pattern matching.
import re
from datetime import datetime


filenames = (
    "64727_Inventory_Details-05-02-24.txt",
    "64728_Inventory_Details-05-04-24.txt",
    "64728_Inventory_Details.txt",
)
pattern = ".*(\d+-\d+-\d+).txt"
valid_files = []
for name in filenames:
    try:
        datestr = re.match(pattern, name).group(1)
        valid_files.append((datetime.strptime(datestr, "%m-%d-%y").date(), name))
    except (ValueError, AttributeError):
        pass
print(*valid_files, sep="\n")
Results are the same.