Python Forum

Full Version: os.rename Windows remove illegal char
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I need help understanding the best practice for removing illegal characters from file and path names.

Windows (FAT32, NTFS): Any Unicode except NUL, \, /, :, *, ?, ", <, >, |

# import libraries
import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# get name movie
# Fetch the IMDb title page and read the movie name from the
# <h1 itemprop="name"> element.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop':'name'})
title = movie.get_text(strip=True)

print(title)

# clean title
# Windows rejects file names containing \ / : * ? " < > | or control
# characters (NUL included), so drop them before the title is used as a
# file name; otherwise os.rename below fails for titles such as "A: B".
illegal = u'\\/:*?"<>|'
safe_title = u''.join(ch for ch in title if ch not in illegal and ord(ch) >= 32)

# get cover movie
cover = soup.find(attrs={"class" : "poster"})
cover_url = (cover.find('img'))['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if reading the download raises.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

#get ext
ext = os.path.splitext(imgn)[1]

#get path & file name
x =  os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

#rename file
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], safe_title+ext)
print(old_file)
print(new_file)
# os.rename moves the file, so nothing is left at old_file afterwards;
# a following os.remove(old_file) would raise OSError and is not needed.
os.rename(old_file,new_file)

I found my way Dance

# import libraries

import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# get name movie
# Fetch the IMDb title page and read the movie name from the
# <h1 itemprop="name"> element.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop':'name'})
title = movie.get_text(strip=True)

print(title)

#clean string loop
# Characters Windows does not allow in file names. Each entry is a single
# character: the NUL character itself (not the text "NUL"), the backslash
# (which must be escaped inside a Python string literal), and so on.
ctitle = title
illegal = ['\x00', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

print(ctitle)

# get cover movie
cover = soup.find(attrs={"class" : "poster"})
cover_url = (cover.find('img'))['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if reading the download raises.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

#get ext
ext = os.path.splitext(imgn)[1]

#get path & file name
x =  os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

#rename file
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], ctitle+ext)
print(old_file)
print(new_file)
# os.rename moves the file, so nothing is left at old_file afterwards;
# a following os.remove(old_file) would raise OSError and is not needed.
os.rename(old_file,new_file)
You can use the regex substitute method (re.sub) to replace the loop:

# Loop-based approach: remove each character Windows forbids in file names.
# The backslash has to be written as '\\' and NUL as '\x00'; a bare '\'
# swallows the closing quote and merges neighbouring list entries.
illegal = ['\x00', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')
# Regex approach: a single substitution over a character class. Besides the
# forbidden characters, this pattern also strips . % $ ^ & and the pound
# sign, which Windows itself would accept.
import re
ctitle = re.sub(r'[\\/\:*?"<>\|\.%\$\^&£]', '', ctitle)
Nice thx
Good that you found a solution; here is another way.
Some advice: you should always use Requests.
You see the yellow banner Wink
So this code is for Python 3.6 and works on Windows (see: Python 3.6 and pip installation under Windows).
import requests
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
# Test other url
#url = 'http://www.imdb.com/title/tt5294550/?ref_=inth_ov_i'
#url = 'http://www.imdb.com/title/tt5726086/?ref_=inth_ov_tt'

# Translation table that deletes every character Windows forbids in file
# names (\ / : * ? " < > |); stripping only ':' still lets open() fail on
# titles that contain any of the others.
illegal_chars = str.maketrans('', '', '\\/:*?"<>|')

url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
image = soup.find('div', class_="poster")
# The poster's alt text is used as the file name once it has been cleaned.
image_name = image.find('img').get('alt').translate(illegal_chars)
image_url = image.find('img').get('src')

# Download
response = requests.get(image_url)
with open(f'{image_name}.jpg', 'wb') as f:
    f.write(response.content)