Python Forum
os.rename Windows remove illegal char - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html)
+--- Thread: os.rename Windows remove illegal char (/thread-7301.html)



os.rename Windows remove illegal char - fgerrata - Jan-03-2018

I need help understanding the best practice for removing illegal characters from file and path names.

Windows (FAT32, NTFS): Any Unicode except NUL, \, /, :, *, ?, ", <, >, |

# import libraries
import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Fetch the IMDb page and read the movie title from the <h1 itemprop="name"> tag.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop': 'name'})
title = movie.get_text(strip=True)

print(title)

# Download the poster image into the current directory, keeping the
# file name used in the image URL.
cover = soup.find(attrs={"class": "poster"})
cover_url = cover.find('img')['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if the download fails half-way.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Extension of the downloaded file (e.g. ".jpg"), reused for the new name.
ext = os.path.splitext(imgn)[1]

# Absolute directory and file name of the downloaded image.
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the image after the movie title.
old_file = os.path.join(x[0], x[1])
# NOTE: the title is used verbatim here; if it contains a character that
# Windows does not allow in file names (':' for example), the rename below
# may fail. This is the open question of this post.
new_file = os.path.join(x[0], title + ext)
print(old_file)
print(new_file)
# os.rename already moves the file, so there is nothing left to delete:
# a following os.remove(old_file) would raise because the old path is gone
# (or delete the image when both names are identical).
os.rename(old_file, new_file)

I found my way! (Dance)

# import libraries

import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Fetch the IMDb page and read the movie title from the <h1 itemprop="name"> tag.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop': 'name'})
title = movie.get_text(strip=True)

print(title)

# Clean the title so it can be used as a Windows file name.
# Each entry is ONE forbidden character: NUL, backslash, slash, colon,
# asterisk, question mark, double quote, angle brackets and pipe.
# The backslash must be written '\\'; a lone '\' escapes the closing quote
# and silently merges the following entries into one string.
ctitle = title
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

print(ctitle)

# Download the poster image into the current directory, keeping the
# file name used in the image URL.
cover = soup.find(attrs={"class": "poster"})
cover_url = cover.find('img')['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# "with" closes the file even if the download fails half-way.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Extension of the downloaded file (e.g. ".jpg"), reused for the new name.
ext = os.path.splitext(imgn)[1]

# Absolute directory and file name of the downloaded image.
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the image after the cleaned movie title.
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], ctitle + ext)
print(old_file)
print(new_file)
# os.rename already moves the file, so there is nothing left to delete:
# a following os.remove(old_file) would raise because the old path is gone
# (or delete the image when both names are identical).
os.rename(old_file, new_file)



RE: os.rename Windows remove illegal char - hshivaraj - Jan-03-2018

You can use the regex substitution function (re.sub) to replace the loop:

# Loop-based approach: strip each forbidden character in turn.
# Every entry is a single character; the backslash has to be written '\\',
# otherwise it escapes the closing quote and corrupts the list.
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

# Regex alternative: one character class removes everything in a single pass.
# Besides the characters Windows forbids (including '?' and NUL) it also
# drops '.', '%', '$', '^', '&' and '£'.
import re
ctitle = re.sub(r'[\\/:*?"<>|.%$^&£\x00]', '', ctitle)



RE: os.rename Windows remove illegal char - fgerrata - Jan-03-2018

Nice thx


RE: os.rename Windows remove illegal char - snippsat - Jan-03-2018

Good that you found a solution; here is another way.
Some advice: you should always use Requests.
You see the yellow banner (Wink).
So this code is for Python 3.6 and works on Windows; see "Python 3.6 and pip installation under Windows".
import requests
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
# Test other url
#url = 'http://www.imdb.com/title/tt5294550/?ref_=inth_ov_i'
#url = 'http://www.imdb.com/title/tt5726086/?ref_=inth_ov_tt'

# Fetch the movie page and locate the poster <img> tag.
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
image = soup.find('div', class_="poster")
img_tag = image.find('img')

# The alt text becomes the file name, so remove every character Windows
# forbids in file names (not only ':'), in a single pass.
forbidden = str.maketrans('', '', '\0\\/:*?"<>|')
image_name = img_tag.get('alt').translate(forbidden)
image_url = img_tag.get('src')

# Download
response = requests.get(image_url)
with open(f'{image_name}.jpg', 'wb') as f:
    f.write(response.content)