Python Forum
os.rename Windows remove illegal char
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
os.rename Windows remove illegal char
#1
I need help understanding the best practice for removing illegal characters from file and path names.

Windows (FAT32, NTFS): Any Unicode except NUL, \, /, :, *, ?, ", <, >, |

# import libraries
import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Get the movie title from the page's <h1 itemprop="name"> element.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop':'name'})
title = movie.get_text(strip=True)

print(title)

# Download the cover image into the current directory, named after the
# last component of the image URL's path.
cover = soup.find(attrs={"class" : "poster"})
cover_url = (cover.find('img'))['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# 'with' closes the file even if the read or write raises.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Keep the downloaded file's extension for the renamed file.
ext = os.path.splitext(imgn)[1]

# Split the absolute path into (directory, file name).
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the downloaded file to "<title><ext>" in the same directory.
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], title+ext)
print(old_file)
print(new_file)
# NOTE: title is used unmodified here, so the rename fails on Windows when
# the title contains a character that is not allowed in file names
# (e.g. ':'); that is the open question of this post.
os.rename(old_file,new_file)#???????
# No os.remove(old_file) afterwards: a successful rename already moved the
# file, so old_file no longer exists and removing it would raise OSError.

I found my way Dance

# import libraries

import urllib2
from bs4 import BeautifulSoup
import urlparse, os

# Get the movie title from the page's <h1 itemprop="name"> element.
spage = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
page = urllib2.urlopen(spage)
soup = BeautifulSoup(page, 'html.parser')
movie = soup.find('h1', attrs={'itemprop':'name'})
title = movie.get_text(strip=True)

print(title)

# Strip every character that Windows does not allow in a file name.
# Each entry is exactly one character: '\0' is the NUL character (not the
# text "NUL") and '\\' is a single backslash. The backslash must be escaped,
# otherwise it swallows the closing quote and merges neighbouring entries.
ctitle = title
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

print(ctitle)

# Download the cover image into the current directory, named after the
# last component of the image URL's path.
cover = soup.find(attrs={"class" : "poster"})
cover_url = (cover.find('img'))['src']
img = urllib2.urlopen(cover_url)
a = urlparse.urlparse(cover_url)
imgn = os.path.basename(a.path)
# 'with' closes the file even if the read or write raises.
with open(imgn, 'wb') as localFile:
    localFile.write(img.read())

# Keep the downloaded file's extension for the renamed file.
ext = os.path.splitext(imgn)[1]

# Split the absolute path into (directory, file name).
x = os.path.split(os.path.abspath(imgn))
print(x[0])
print(x[1])

# Rename the downloaded file to "<cleaned title><ext>" in the same directory.
old_file = os.path.join(x[0], x[1])
new_file = os.path.join(x[0], ctitle+ext)
print(old_file)
print(new_file)
os.rename(old_file,new_file)
# No os.remove(old_file) afterwards: a successful rename already moved the
# file, so old_file no longer exists and removing it would raise OSError.
Reply
#2
You can use the regex substitute method (re.sub) to replace the loop:

# Loop version: remove the characters one at a time.
# Each entry is exactly one character: '\0' is the NUL character and '\\' is
# a single (escaped) backslash.
illegal = ['\0', '\\', '/', ':', '*', '?', '"', '<', '>', '|']

for i in illegal:
    ctitle = ctitle.replace(i, '')

# Regex version: a single re.sub call with a character class does the same
# job as the loop above. Besides the characters Windows forbids, this class
# also removes '.', '%', '$', '^', '&' and '£'.
import re
ctitle = re.sub(r'[\\/\:*?"<>\|\.%\$\^&£]', '', ctitle)
Reply
#3
Nice thx
Reply
#4
Good that you found a solution; here is another way.
Some advice: you should always use Requests.
You see the yellow banner Wink
So this code is for Python 3.6 and works on Windows; see Python 3.6 and pip installation under Windows
import requests
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/title/tt2527336/?ref_=rlm'
# Test other url
#url = 'http://www.imdb.com/title/tt5294550/?ref_=inth_ov_i'
#url = 'http://www.imdb.com/title/tt5726086/?ref_=inth_ov_tt'

# Translation table that deletes every character Windows rejects in a file
# name (NUL \ / : * ? " < > |), not just ':'.
illegal = str.maketrans('', '', '\0\\/:*?"<>|')

url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
image = soup.find('div', class_="poster")
# Look the <img> tag up once; its alt text becomes the file name and its
# src attribute is the download URL.
img_tag = image.find('img')
image_name = img_tag.get('alt').translate(illegal)
image_url = img_tag.get('src')

# Download
response = requests.get(image_url)
# Fail loudly on an HTTP error instead of saving the error page as a .jpg.
response.raise_for_status()
with open(f'{image_name}.jpg', 'wb') as f:
    f.write(response.content)
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  expecting value: line 1 column 1 (char 0) in print (r.json)) loutsi 3 7,566 Jun-05-2020, 08:38 PM
Last Post: nuffink

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020