Python Forum

email crawler in python
Hello. We have some Python code that is part of a larger program. It takes a URL from the user, then crawls to a depth of 2 from that URL and extracts email addresses. The goal is to remove the depth limit so that it searches all subdomains and links reachable from the given URL, without any restriction. Please guide me and share the modified code.



import re
import urllib.request

from bs4 import BeautifulSoup


def extractUrl(url):
    print ("Searching, please wait...")
    print ("This operation may take several minutes")
    try:
        count = 0
        listUrl = []  # despite its name, this list holds the email addresses found so far

        conn = urllib.request.urlopen(url)

        html = conn.read().decode('utf-8')

        # {2,} rather than {2,4}, so longer TLDs such as .online are also matched
        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
        print("Searching in " + url)

        for email in emails:
            if email not in listUrl:
                count += 1
                print(str(count) + " - " + email)
                listUrl.append(email)


        soup = BeautifulSoup(html, "lxml")
        links = soup.find_all('a')

        for tag in links:
            link = tag.get('href', None)
            if link is not None:
                try:
                    print("Searching in " + link)
                    if link.startswith('http'):
                        f = urllib.request.urlopen(link)
                        s = f.read().decode('utf-8')
                        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", s)
                        for email in emails:
                            if email not in listUrl:
                                count += 1
                                print(str(count) + " - " + email)
                                listUrl.append(email)
                                # searchEmail and insertEmail are database helpers
                                # from the larger program and are not shown here
                                if searchEmail("EmailCrawler.db", email, "Especific Search") == 0:
                                    insertEmail("EmailCrawler.db", email, "Especific Search", url)
                except Exception as exc:
                    # the original inner try had no except clause; skip links
                    # that fail to download or decode
                    print("Could not read " + link + ": " + str(exc))
    except Exception as exc:
        print("Error while searching " + url + ": " + str(exc))