Parsing Attached .MSG Files with Python3

Parsing Attached .MSG Files with Python3 - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: General Coding Help (https://python-forum.io/forum-8.html)
+--- Thread: Parsing Attached .MSG Files with Python3 (/thread-17427.html)

Parsing Attached .MSG Files with Python3 - ericl42 - Apr-10-2019

I'm trying to monitor a phishing inbox that could receive both normal emails (i.e. HTML/text based with potential attachments) as well as emails that have a .MSG file attached to them.

The goal is to have users send emails to [email protected] and, once I parse out the various links (potentially malicious) as well as attachments (also potentially malicious), I'll perform some analysis on them.

The issue I'm running into is the body of the .msg file that is attached.

With the code below, I'm able to pull the to, from, subject, and all links within the original email. It also pulls down any attachments within the .msg file (e.g. on my test I was able to pull down a PDF within the .msg). However, I cannot get any of the to, from, subject, or body of the .msg file.

When I print it out as raw I get some of it in a very ugly format, but apparently with the multi-parts, I'm doing something wrong to get that piece of information.

I'm fairly new to Python so any help would be greatly appreciated.

import base64
import email
import email.header
import email.utils
import imaplib
import os
import re

from bs4 import BeautifulSoup

# Mailbox connection settings. The values shown look like redacted
# placeholders; substitute real ones before running.
server = "mail.server.com"
email_user = "[email protected]"
email_pass = "XXXXXXXXXXXX"

# Directory that extracted attachments are written into.
output_dir = "/tmp/attachments/"

# Starts out as an empty string.
body = ""

def get_body(msg):
    """Return the decoded payload of the first leaf part of *msg*.

    Starting at *msg*, the first child of each multipart container is
    followed until a non-multipart part is reached; that part's payload
    is returned with its Content-Transfer-Encoding undone.
    """
    leaf = msg
    while leaf.is_multipart():
        # Descend into the first sub-part only; siblings are ignored.
        leaf = leaf.get_payload(0)
    return leaf.get_payload(None, True)

def get_attachments(msg, dest_dir=None):
    """Save every named attachment found in *msg* to disk.

    Every MIME part of *msg* is visited. A part is written out when it is
    not a multipart container, carries a ``Content-Disposition`` header,
    declares a filename, and has a decodable payload.

    Args:
        msg: A parsed ``email.message.Message`` (not raw bytes).
        dest_dir: Directory to write into. When omitted, the module-level
            ``output_dir`` is used. The directory is created if missing.

    Returns:
        None. Files are written as a side effect; an existing file with
        the same name is overwritten.
    """
    target_dir = output_dir if dest_dir is None else dest_dir

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if not fileName:
            continue

        # The filename is attacker-controlled. Keep only its final path
        # component (treating both "/" and "\" as separators) so values
        # such as "../../x" or "/etc/x" cannot escape target_dir.
        safe_name = os.path.basename(fileName.replace('\\', '/'))
        safe_name = safe_name.replace('\x00', '')
        if safe_name in ('', '.', '..'):
            continue

        # decode=True yields None for container-like parts such as
        # message/rfc822; writing that would raise TypeError. The nested
        # message's own parts are still visited by msg.walk().
        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, safe_name), 'wb') as f:
            f.write(payload)

# Open an SSL-wrapped IMAP session and authenticate. login() raises
# imaplib.IMAP4.error itself when the server rejects the credentials.
mail = imaplib.IMAP4_SSL(server)
mail.login(email_user, email_pass)

# select() and search() report a server-side "NO" through their status
# value rather than by raising, so check it before trusting the payload;
# otherwise the error text would be split and treated as message ids.
result, data = mail.select('INBOX')
if result != 'OK':
    raise imaplib.IMAP4.error('Could not select INBOX: %r' % (data,))

result, data = mail.search(None, 'UNSEEN')
if result != 'OK':
    raise imaplib.IMAP4.error('UNSEEN search failed: %r' % (data,))

# data[0] is a space-separated byte string of message numbers; it can be
# None when the server sent no SEARCH response at all.
mail_ids = data[0] or b''
id_list = mail_ids.split()
print(id_list)

def _decode_header_value(value):
    """Return a header as readable text, decoding RFC 2047 encoded words.

    A missing header (``None``) becomes an empty string so the result can
    always be concatenated into the printed output.
    """
    if value is None:
        return ''
    return str(email.header.make_header(email.header.decode_header(value)))


# For each unseen message: print its envelope headers, save its
# attachments, and list the links found in its first body part.
for emailid in id_list:
    result, email_data = mail.fetch(emailid, '(RFC822)')
    # A successful fetch yields a (envelope, raw_bytes) tuple first; skip
    # anything else rather than crash on the indexing below.
    if result != 'OK' or not email_data or not isinstance(email_data[0], tuple):
        continue
    raw_email = email_data[0][1]

    # Parse straight from bytes: decoding the whole message as UTF-8 first
    # raises UnicodeDecodeError on mail that uses another charset.
    email_message = email.message_from_bytes(raw_email)
    email_from = _decode_header_value(email_message['From'])
    email_to = _decode_header_value(email_message['To'])
    subject = _decode_header_value(email_message['Subject'])
    print('From: ' + email_from)
    print('To: ' + email_to)
    print('Subject: ' + subject)

    # get_attachments() walks MIME parts, so it needs the parsed message,
    # not the raw bytes.
    get_attachments(email_message)

    # get_body() follows the first child of each multipart container down
    # to a leaf, so this also works for non-multipart and nested messages.
    content = get_body(email_message)
    if content:
        soup = BeautifulSoup(content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href (e.g. named anchors) have no link.
            if href:
                print('Link: ' + href)

RE: Parsing Attached .MSG Files with Python3 - ericl42 - Apr-12-2019

I got this working with the following code. I basically had to do multiple for loops within the .msg walk and then only pull out the relevant information within the text/html sections.

def _header_text(value):
    """Return a header as readable text, decoding RFC 2047 encoded words.

    A missing header (``None``) becomes an empty string.
    """
    if value is None:
        return ''
    return str(email.header.make_header(email.header.decode_header(value)))


def _address_only(value):
    """Return just the e-mail address from an address header, or ''."""
    if value is None:
        return ''
    # str() because a header holding raw non-ASCII bytes may be returned
    # as a Header object rather than a plain string.
    return email.utils.parseaddr(str(value))[1]


# For each message: save attachments, print the reporter's address and
# subject, then print the headers of any attached message and the links
# found in every HTML part.
for emailid in id_list:
    result, data = mail.fetch(emailid, '(RFC822)')
    # A successful fetch yields a (envelope, raw_bytes) tuple first.
    if result != 'OK' or not data or not isinstance(data[0], tuple):
        continue
    raw = email.message_from_bytes(data[0][1])
    get_attachments(raw)

    # The outer headers are already part of the fetched message, so read
    # them from the parsed object instead of issuing extra fetches and
    # running regexes over the repr of the server response.
    print(_address_only(raw['From']))
    print(_header_text(raw['Subject']))

    for part in raw.walk():
        content_type = part.get_content_type()

        if content_type == 'message/rfc822':
            # The payload of an attached message is a one-element list
            # holding the nested message; guard against malformed parts
            # whose payload was left as plain text.
            nested = part.get_payload()
            if not isinstance(nested, list) or not nested:
                continue
            original = nested[0]
            print(_address_only(original['From']))
            print(_address_only(original['To']))
            print(_header_text(original['Subject']))

        elif content_type == 'text/html':
            content = part.get_payload(decode=True)
            if not content:
                continue
            soup = BeautifulSoup(content, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                # Anchors without an href have nothing to report.
                if href:
                    print('Link: ' + href)