A modified version that can filter for unique e-mail addresses while keeping the order of first occurrence.
In addition, you can sort the result.
#!/usr/bin/env python3
"""
Extracts email addresses from one or more plain text files.
"""
import re
from argparse import ArgumentParser
# Pattern for e-mail addresses, including obfuscated forms that spell the
# separators out as " at " and " dot " (the word surrounded by whitespace).
# Only lowercase letters are listed, so input must be lowercased first.
#
# Raw strings are used so the backslash sequences (\/, \., \s) reach the regex
# engine untouched; in ordinary string literals they are invalid escape
# sequences that Python warns about.
#
# Capture groups:
#   1 - the whole address
#   2 - the "@" / " at " separator
#   3 - the last "." / " dot " separator before the final domain label
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
def get_emails(text):
    """Return a lazy iterator over the e-mail addresses found in ``text``.

    Each item is the full matched address (capture group 1 of ``regex``).
    """
    matches = regex.findall(text)
    # The pattern allows "/" in the local part, so a URL carrying user info
    # (e.g. "http://user@host.example") is picked up as "//user@host.example".
    # Such hits are not real addresses and are skipped.
    return (groups[0] for groups in matches if groups[0][:2] != "//")
def sort_by_tld(email):
    """Return the sort key ``(domain, user)`` for an e-mail address.

    Despite the function name, the first element is the whole domain part
    (everything after the separator), not only the top-level domain.

    The address is split at its first separator, which is either ``@`` or the
    obfuscated form of the word "at" surrounded by whitespace (as in
    ``john at example dot com``).  Splitting on ``@`` alone would raise
    ``ValueError`` for such obfuscated addresses.

    A string without any separator is treated as a user part with an empty
    domain, so sorting never fails on unexpected input.
    """
    parts = re.split(r"@|\sat\s", email, maxsplit=1)
    if len(parts) == 2:
        user, domain = parts
    else:
        # No separator found: keep the whole string as the user part.
        user, domain = email, ""
    return (domain, user)
def main(files, unique, sort):
    """Print every e-mail address found in ``files``, one per line.

    files  -- paths of the text files to scan
    unique -- if true, drop repeated addresses, keeping the first occurrence
    sort   -- if true, order the output with ``sort_by_tld`` as the key
    """
    found = []
    for path in files:
        with open(path) as handle:
            content = handle.read()
        # Lowercase before matching: the pattern only lists lowercase letters.
        found += get_emails(content.lower())

    if unique:
        # dict keys preserve insertion order, so the first occurrence wins.
        found = list(dict.fromkeys(found))

    if sort:
        found.sort(key=sort_by_tld)

    for address in found:
        print(address)
def get_args():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``files`` (one or more paths), ``u`` (unique flag)
    and ``s`` (sort flag).
    """
    cli = ArgumentParser(description=__doc__)
    cli.add_argument("files", nargs="+", help="files to parse for e-mail addresses")

    # Both switches are plain on/off flags; register them in one pass.
    switches = (
        ("-u", "only unique emails"),
        ("-s", "sort result"),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    return cli.parse_args()
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    args = get_args()
    main(files=args.files, unique=args.u, sort=args.s)
Then you call the program:
python3 get_mails.py *.txt -u -s > emails.txt
The shell replaces the
*.txt
with matching filenames.
To understand this behavior, you can make a small program for testing:
import sys
# Show the argument list exactly as this program received it.
print(sys.argv)
Then execute the program. I have given it the name args_print.py:
python args_print.py a b c d *
The * is replaced with the names of the files in the current working directory. Hidden files, whose names start with a dot, are excluded.
PS: Windows PowerShell and the Windows Terminal do not have this behavior. In this case, * is not replaced with matching files; the program receives the literal * as an argument instead.