os.rename Windows remove illegal char - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: os.rename Windows remove illegal char (/thread-7301.html) |
os.rename Windows remove illegal char - fgerrata - Jan-03-2018 I need help to understand the best practice to remove illegal characters from file and path names. Windows (FAT32, NTFS): Any Unicode except NUL, \, /, :, *, ", <, >, | # import libraries import urllib2 from bs4 import BeautifulSoup import urlparse, os # get name movie spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm' page = urllib2.urlopen(spage) soup = BeautifulSoup(page, 'html.parser') movie = soup.find('h1', attrs={'itemprop':'name'}) title = movie.get_text(strip=True) print title # get cover movie cover = soup.find(attrs={"class" : "poster"}) cover_url = (cover.find('img'))['src'] img = urllib2.urlopen(cover_url) a = urlparse.urlparse(cover_url) a.path imgn = os.path.basename(a.path) localFile = open(imgn, 'wb') localFile.write(img.read()) localFile.close() #get ext ext = os.path.splitext(imgn)[1] #get path & file name x = os.path.split(os.path.abspath(imgn)) print x[0] print x[1] #rename file old_file = os.path.join(x[0], x[1]) new_file = os.path.join(x[0], title+ext) print old_file print new_file os.rename(old_file,new_file)#??????? 
os.remove(old_file) I found my way # import libraries import urllib2 from bs4 import BeautifulSoup import urlparse, os # get name movie spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm' page = urllib2.urlopen(spage) soup = BeautifulSoup(page, 'html.parser') movie = soup.find('h1', attrs={'itemprop':'name'}) title = movie.get_text(strip=True) print title #clean string loop ctitle = title illegal = ['NUL','\',''//',':','*','"','<','>','|'] for i in illegal: ctitle = ctitle.replace(i, '') print(ctitle) # get cover movie cover = soup.find(attrs={"class" : "poster"}) cover_url = (cover.find('img'))['src'] img = urllib2.urlopen(cover_url) a = urlparse.urlparse(cover_url) a.path imgn = os.path.basename(a.path) localFile = open(imgn, 'wb') localFile.write(img.read()) localFile.close() #get ext ext = os.path.splitext(imgn)[1] #get path & file name x = os.path.split(os.path.abspath(imgn)) print x[0] print x[1] #rename file old_file = os.path.join(x[0], x[1]) new_file = os.path.join(x[0], ctitle+ext) print old_file print new_file os.rename(old_file,new_file) os.remove(old_file) RE: os.rename Windows remove illegal char - hshivaraj - Jan-03-2018 You can use the regex substitute method to replace illegal = ['NUL','\',''//',':','*','"','<','>','|'] for i in illegal: ctitle = ctitle.replace(i, '') import re ctitle = re.sub(r'[\\/\:*"<>\|\.%\$\^&£]', '', ctitle) RE: os.rename Windows remove illegal char - fgerrata - Jan-03-2018 Nice thx RE: os.rename Windows remove illegal char - snippsat - Jan-03-2018 Good that you found a solution; here is another way. Some advice: you should always use Requests. 
You see the yellow banner. So this code is for Python 3.6 and works on Windows, Python 3.6 and pip installation under Windows import requests from bs4 import BeautifulSoup url = 'http://www.imdb.com/title/tt2527336/?ref_=rlm' # Test other url #url = 'http://www.imdb.com/title/tt5294550/?ref_=inth_ov_i' #url = 'http://www.imdb.com/title/tt5726086/?ref_=inth_ov_tt' url_get = requests.get(url) soup = BeautifulSoup(url_get.content, 'lxml') image = soup.find('div', class_="poster") image_name = image.find('img').get('alt').replace(':', '') image_url = image.find('img').get('src') # Download response = requests.get(image_url) with open(f'{image_name}.jpg', 'wb') as f: f.write(response.content) |