Python Forum
Parsing Oasis Open Document format.
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Parsing Oasis Open Document format.
#3
from bs4 import BeautifulSoup, SoupStrainer
import requests, re

def main():
    """Scrape the OpenDocument v1.2 (part 1) HTML page into ``basliklar.txt``.

    Downloads the specification page, collects every ``<tr>`` and ``<td>``
    element, and writes the text of each element that has no next sibling
    to ``basliklar.txt``, one entry per line after a header line. For every
    other element, the list of all ``href`` values found on the page is
    printed to stdout.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be opened or written.
    """
    # Fetch the page text with requests.
    url = 'http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415340_253892949'
    # A timeout keeps the script from hanging forever on a stalled server.
    req = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an HTTP error page as if it were the spec.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, "lxml")
    # Example anchor on the page:
    # '<a href="#__RefHeading__1419338_253892949">19.905 xhtml:about</a>'

    containers = soup.find_all(['tr', 'td'])

    # The href list does not depend on the current container, so build it
    # once instead of re-scanning the whole document on every iteration.
    links = [link.get('href') for link in soup.find_all('a')]

    filename = "basliklar.txt"
    # NOTE(review): the header announces a "link" column, but only the
    # heading text is ever written below -- confirm the intended format.
    headers = "baslik, link\n"

    # 'with' guarantees the file is closed even if the loop raises; an
    # explicit UTF-8 encoding avoids locale-dependent encode errors on
    # non-ASCII page text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(headers)

        # Extract each heading and the data that goes with it.
        # There is no data corresponding to the tag itself
        # (i.e. container.next_sibling.text is not usable here).
        for container in containers:
            if container.next_sibling is None:
                baslik = container.text
                f.write(baslik + "\n")
            else:
                print(links)

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Reply


Messages In This Thread
Parsing Oasis Open Document format. - by Achilles - Apr-17-2020, 01:40 AM
RE: Parsing Oasis Open Document format. - by buran - Apr-17-2020, 05:44 AM
RE: Parsing Oasis Open Document format. - by Achilles - Apr-17-2020, 01:51 PM

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020