Python Forum

I need help understanding the best practice for removing illegal characters from file and path names.

Windows (FAT32, NTFS): any Unicode character except NUL, \, /, :, *, ?, ", <, >, |

# import libraries
import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Get the movie name from the IMDb title page.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop': 'name'})
title = movie.get_text(strip=True)

print(title)

# Strip everything Windows refuses in a file name: the reserved characters
# \ / : * ? " < > | and the control characters 0-31 (which include NUL).
# Without this, os.rename() below fails for any title containing one of them
# (a ':' in the title is enough).
illegal = '\\/:*?"<>|'
ctitle = ''.join(ch for ch in title if ch not in illegal and ord(ch) >= 32)

# Download the cover image and save it under the name used in its URL.
cover = soup.find(attrs={"class": "poster"})
cover_url = cover.find('img')['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if reading the response fails part-way.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Keep the original extension (e.g. ".jpg") for the renamed file.
ext = os.path.splitext(imgn)[1]

# Split the absolute path into directory (x[0]) and file name (x[1]).
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the downloaded file to "<cleaned title><ext>" in the same directory.
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], ctitle + ext)
print(old_file)
print(new_file)
# os.rename() moves the file, so old_file no longer exists afterwards; the
# os.remove(old_file) call that used to follow always raised OSError.
os.rename(old_file, new_file)

I found my own way. (Dance)

# import libraries

import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Get the movie name from the IMDb title page.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop': 'name'})
title = movie.get_text(strip=True)

print(title)

# Remove the characters Windows does not allow in a file name.
# Each entry must be exactly one character:
#   * NUL is the character '\0', not the three letters 'NUL';
#   * a backslash has to be written '\\' -- a lone '\' before the closing
#     quote escapes that quote, which previously merged the backslash and
#     slash entries into the single useless string "',//".
ctitle = title
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

print(ctitle)

# Download the cover image and save it under the name used in its URL.
cover = soup.find(attrs={"class": "poster"})
cover_url = cover.find('img')['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if reading the response fails part-way.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Keep the original extension (e.g. ".jpg") for the renamed file.
ext = os.path.splitext(imgn)[1]

# Split the absolute path into directory (x[0]) and file name (x[1]).
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the downloaded file to "<cleaned title><ext>" in the same directory.
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], ctitle + ext)
print(old_file)
print(new_file)
# os.rename() moves the file, so old_file no longer exists afterwards; the
# os.remove(old_file) call that used to follow always raised OSError.
os.rename(old_file, new_file)

You can replace the following loop with a single regular-expression substitution (re.sub), shown after it:

# One entry per forbidden character. NUL is the character '\0' (not the
# letters 'NUL') and a literal backslash must be written '\\'; a lone '\'
# before the closing quote escapes the quote and corrupts the list.
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

import re
# Delete every unwanted character in one pass. Inside [...] only the
# backslash needs escaping here; the class covers the control characters
# 0x00-0x1f (including NUL), the Windows-reserved \ / : * ? " < > | and the
# extra symbols . % $ ^ & £ that this snippet also strips.
ctitle = re.sub(r'[\x00-\x1f\\/:*?"<>|.%$^&£]', '', ctitle)

Nice thx

Good that you found a solution; here is another way.
Some advice: you should always use Requests.
You can see the yellow banner about it. (Wink)

This code is for Python 3.6 and works on Windows; see "Python 3.6 and pip installation under Windows".

import requests
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
# Test other url
#url = 'http://www.imdb.com/title/tt5294550/?ref_=inth_ov_i'
#url = 'http://www.imdb.com/title/tt5726086/?ref_=inth_ov_tt'

url_get = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page.
url_get.raise_for_status()
soup = BeautifulSoup(url_get.content, 'lxml')
image = soup.find('div', class_="poster")
poster_img = image.find('img')

# The poster's alt text becomes the file name, so drop every character
# Windows rejects in a name: \ / : * ? " < > | and control characters 0-31.
reserved = '\\/:*?"<>|' + ''.join(map(chr, range(32)))
image_name = poster_img.get('alt').translate(str.maketrans('', '', reserved))
image_url = poster_img.get('src')

# Download
response = requests.get(image_url)
# Do not save an HTTP error body as if it were the image.
response.raise_for_status()
with open(f'{image_name}.jpg', 'wb') as f:
    f.write(response.content)

fgerrata

hshivaraj

fgerrata

snippsat