Thanks for the feedback, I'm using windows but I;m at work right now and will following these instructions as soon as I get home and let you know.
OK,
I followed the instructions and install the python interpreter, I then downloaded the python code as a zip file and renamed it zip and placed it on my c drive. I then ran the command line and changed the directory to c:\zip and then pasted the below code, it ran and then return a number of messages. I'll upload them afterward. I'm not sure what next to do though, nor do I know where to copy and paste the URL for the extext book.
#! /usr/bin/env python3
import urllib.parse
import tempfile
import json
import urllib.request
import hashlib
import os
import sys
import time
import re
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.generic import NameObject, DictionaryObject, ArrayObject, NumberObject
from multiprocessing.pool import ThreadPool
language = "en_US"
roletypeid = 2 # 3 for instructor
arabicRegex = re.compile(r"^(?P<prefix>.*?)(\d+)$")
romanRegex = re.compile(r"^(?P<prefix>.*?)((?:(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3})))+)$", re.IGNORECASE)
# Some interesting parts of the books js code:
#
# MD5_SECRET_KEY: "ipadsecuretext"
# Dear Pearson,
# Please don't consider MD5 a "secure" algorithm by any means.
# Sincerely,
# Everybody that looks at your horrifying code
#
# UserRoleType: {
# Student: 2,
# Instructor: 3,
# }
# The above corresponds to the `roletypeid` GET parameter in a lot pf the requests
# Surprisingly, it's not checked at any point to see if, say, a student is impersonating
# a teacher, even though the API throws out an error if it is omited.
#
# Also, since it's there, a good TODO would be to download other types of media along with the PDF.
# Should be relatively simple.
bookInfoUrl = "http://view.ebookplus.pearsoncmg.com/ebook/pdfplayer/getbookinfov2?bookid={}&outputformat=JSON"
pageInfoUrl = "https://view.ebookplus.pearsoncmg.com/ebook/pdfplayer/getpagedetails?userid={userid}&userroleid={userroleid}&bookid={bookid}&bookeditionid={bookeditionid}&authkey={authkey}"
pdfUrl = "https://view.ebookplus.pearsoncmg.com/ebook/pdfplayer/getpdfpage?globalbookid={bookid}&pdfpage={pdfpage}&iscover={iscover}&authkey={authkey}"
bookmarkInfoUrl = "https://view.ebookplus.pearsoncmg.com/ebook/pdfplayer/getbaskettocinfo?userroleid={userroleid}&bookid={bookid}&language={language}&authkey={authkey}&bookeditionid={bookeditionid}&basket=all&scenarioid={scenarioid}&platformid=1001"
def hsidUrl(aUrl):
# Append this url's "hsid" to it (md5 hash of its http url)
md5Hasher = hashlib.new("md5")
md5Hasher.update(b"ipadsecuretext")
md5Hasher.update(aUrl.replace("https://","http://").encode("utf-8"))
return aUrl + "&hsid=" + md5Hasher.hexdigest()
def main(eTextUrl):
bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
if (bookData.get("values", None)) is not None:
bookData = {
itemName : [itemValue] for itemName, itemValue in
zip(*[iter(bookData["values"][0].split("::"))]*2)
}
# A few fixes in terms of capitalization
bookData["bookid"] = bookData["bookID"]
bookData["userid"] = bookData["userID"]
bookData["sessionid"] = bookData["sessionID"]
# We'll default to the roletypeid for a student
bookData["roletypeid"] = [roletypeid] # 3 for Instructor... the server doesn't care, though
print("Downloading metadata and eText information...")
bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
#print(hsidUrl(bookInfoGetUrl))
with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
str_response = bookInfoRequest.read().decode('utf-8')
bookInfo = json.loads(str_response)
bookInfo = bookInfo[0]['userBookTOList'][0]
pageInfoGetUrl = pageInfoUrl.format(
userid=bookData['userid'][0],
userroleid=bookData['roletypeid'][0],
bookid=bookData['bookid'][0],
bookeditionid=bookInfo['bookEditionID'],
authkey=bookData['sessionid'][0],
)
with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']
def getPageUrl(pdfPage, isCover="N"):
pdfPage = pdfPage.replace("/assets/","")
getPage = pagePath = pdfUrl.format(
bookid=bookInfo['globalBookID'],
pdfpage=pdfPage,
iscover=isCover,
authkey=bookData['sessionid'][0]
)
return hsidUrl(getPage)
with tempfile.TemporaryDirectory() as pdfDownloadDir:
# Use a temporary directory to download all the pdf files to
# First, download the cover file
pdfPageTable = {}
pdfPageLabelTable = {}
urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
# Then, download all the individual pages for the e-book
def download(pdfPage):
pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
savePath = os.path.join(pdfDownloadDir, "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)
threadPool = ThreadPool(40) # 40 threads should download a book fairly quickly
print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
threadPool.map(download, pageInfo)
print("Assembling PDF...")
# Begin to assemble the final PDF, first by adding all the pages
fileMerger = PdfFileWriter()
for pdfFile in sorted(os.listdir(pdfDownloadDir)):
fileMerger.addPage(PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))
# And then add all the bookmarks to the final PDF
bookmarkInfoGetUrl = bookmarkInfoUrl.format(
userroleid=bookData['roletypeid'][0],
bookid=bookData['bookid'][0],
language=language,
authkey=bookData['sessionid'][0],
bookeditionid=bookInfo['bookEditionID'],
scenarioid=bookData['scenario'][0],
)
bookmarksExist = True
with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
try:
bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
except Exception as e:
bookmarksExist = False
def recursiveSetBookmarks(aDict, parent=None):
if isinstance(aDict, dict):
aDict = [aDict]
for bookmark in aDict:
# These are the main bookmarks under this parent (or the whole document if parent is None)
bookmarkName = bookmark['n'] # Name of the section
pageNum = str(bookmark['lv']['content']) # First page (in the pdf's format)
latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)
if 'be' in bookmark:
recursiveSetBookmarks(bookmark['be'], latestBookmark)
if bookmarksExist:
print("Adding bookmarks...")
fileMerger.addBookmark("Cover", 0) # Add a bookmark to the cover at the beginning
recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
else:
print("Bookmarks don't exist for ID {}".format(bookData['bookid']))
print("Fixing metadata...")
# Hack to fix metadata and page numbers:
pdfPageLabelTable = [(v,k) for k,v in pdfPageTable.items()]
pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
labels = ArrayObject([
NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
])
lastMode = None
lastPrefix = ""
# Now we check to see the ranges where we have roman numerals or arabic numerals
# The following code is not ideal for this, so I'd appreciate a PR with a better solution
for pageNumber, pageLabel in pdfPageLabelTable:
currMode = None
prefix = ""
style = DictionaryObject()
if arabicRegex.match(pageLabel):
currMode = "arabic"
prefix = arabicRegex.match(pageLabel).group("prefix")
style.update({NameObject("/S"): NameObject("/D")})
elif romanRegex.match(pageLabel):
currMode = "roman"
prefix = romanRegex.match(pageLabel).group("prefix")
style.update({NameObject("/S"): NameObject("/r")})
if currMode != lastMode or prefix != lastPrefix:
if prefix:
style.update({
NameObject("/P"): NameObject("({})".format(prefix))
})
labels.extend([
NumberObject(pageNumber),
style,
])
lastMode = currMode
lastPrefix = prefix
rootObj = fileMerger._root_object
# Todo: Fix the weird page numbering bug
pageLabels = DictionaryObject()
#fileMerger._addObject(pageLabels)
pageLabels.update({
NameObject("/Nums"): ArrayObject(labels)
})
rootObj.update({
NameObject("/PageLabels"): pageLabels
})
print("Writing PDF...")
with open("{} - {}.pdf".format(bookData['bookid'][0], bookInfo['title']).replace("/",""), "wb") as outFile:
fileMerger.write(outFile)
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Missing url of eText!")
sys.exit(0)
main(sys.argv[1])
Below is the result of passing the code in the command line:
-------------------------------------------------------------
Output:
C:\zip> })
'})' is not recognized as an internal or external command,
operable program or batch file.
C:\zip> rootObj.update({
'rootObj.update' is not recognized as an internal or external command,
operable program or batch file.
C:\zip> NameObject("/PageLabels"): pageLabels
'NameObject' is not recognized as an internal or external command,
operable program or batch file.
C:\zip> })
'})' is not recognized as an internal or external command,
operable program or batch file.
C:\zip>
C:\zip> print("Writing PDF...")
Can't find file (Writing PDF...)
C:\zip> with open("{} - {}.pdf".format(bookData['bookid'][0], bookInfo['title']).replace("/",""), "wb") as outFile:
'with' is not recognized as an internal or external command,
operable program or batch file.
C:\zip> fileMerger.write(outFile)
'fileMerger.write' is not recognized as an internal or external command,
operable program or batch file.
C:\zip>
C:\zip>if __name__ == '__main__':
The syntax of the command is incorrect.
C:\zip> if len(sys.argv) < 2:
< was unexpected at this time.
C:\zip> print("Missing url of eText!")
Can't find file (Missing url of eText!)
C:\zip> sys.exit(0)
'sys.exit' is not recognized as an internal or external command,
operable program or batch file.
C:\zip> main(sys.argv[1])