#!/usr/bin/env python3
"""Split an ``.eml`` message into separate files next to the input.

For an input ``<base>.eml`` the script writes:

* ``<base>.subject.txt``      -- the Subject header
* ``<base>.txt``              -- the first non-attachment text/plain body
* ``<base>.attachmentNN<ext>`` -- one file per attachment, numbered from 01
* ``<base>.addresses.json``   -- From / To / Cc / Bcc e-mail addresses

NOTE: ``<base>`` is the input path without its extension, so an input that
itself ends in ``.txt`` would be overwritten by the body file.
"""
import sys
import os
import json
import email
from email import policy
from email.parser import BytesParser
from email.utils import getaddresses


def _decode_text(part):
    """Return the text of *part* decoded with its declared charset.

    Returns None when the part has no decodable payload. Bytes that are
    invalid in the chosen charset are dropped (errors="ignore").
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return None

    # Honour the charset from the Content-Type header; decoding everything as
    # UTF-8 silently drops non-ASCII characters of e.g. ISO-8859-1 bodies.
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="ignore")
    except (LookupError, ValueError):
        # Unknown or malformed charset label: fall back to UTF-8.
        return raw.decode("utf-8", errors="ignore")


def _find_text_body(msg):
    """Return the decoded text/plain body of *msg*, or None if there is none."""
    if not msg.is_multipart():
        if msg.get_content_type() == "text/plain":
            return _decode_text(msg)
        return None

    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        # An attached .txt file is not the message body.
        if part.get_content_disposition() == "attachment":
            continue
        return _decode_text(part)
    return None


def _save_attachments(msg, base):
    """Write every attachment of *msg* to ``<base>.attachmentNN<ext>``."""
    counter = 1
    for part in msg.walk():
        if part.get_content_disposition() != "attachment":
            continue

        # Only the extension of the declared filename is reused, so the
        # sender cannot influence the output directory.
        filename = part.get_filename() or f"attachment{counter:02d}.bin"
        _, ext = os.path.splitext(filename)
        outname = f"{base}.attachment{counter:02d}{ext}"

        data = part.get_payload(decode=True)
        if data is None:
            # Container parts (e.g. a forwarded message/rfc822) have no single
            # decodable payload; store their serialised sub-parts instead of
            # failing on write(None).
            data = b"".join(sub.as_bytes() for sub in part.iter_parts())

        with open(outname, "wb") as f:
            f.write(data)

        counter += 1


def main():
    """Extract subject, body, attachments and addresses from ``sys.argv[1]``.

    Exits with status 1 when no argument is given or the file does not exist.
    """
    if len(sys.argv) < 2:
        print("Usage: extract_email.py <file.eml>")
        sys.exit(1)

    eml_file = sys.argv[1]

    if not os.path.isfile(eml_file):
        print(f"Error: file not found: {eml_file}")
        sys.exit(1)

    base = os.path.splitext(eml_file)[0]

    # Parse with the modern policy so headers are decoded to str.
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # ---------------------------
    # Subject
    # ---------------------------
    subject = msg.get("subject", "").strip()

    with open(f"{base}.subject.txt", "w", encoding="utf-8", errors="ignore") as f:
        f.write(subject)

    # ---------------------------
    # Text body
    # ---------------------------
    body_text = _find_text_body(msg)
    if body_text is None:
        body_text = "(No text/plain body found)"

    with open(f"{base}.txt", "w", encoding="utf-8", errors="ignore") as f:
        f.write(body_text)

    # ---------------------------
    # Attachments
    # ---------------------------
    _save_attachments(msg, base)

    # ---------------------------
    # Addresses
    # ---------------------------
    from_parsed = getaddresses([msg.get("from", "")])
    to_parsed = getaddresses(msg.get_all("to", []))
    cc_parsed = getaddresses(msg.get_all("cc", []))
    bcc_parsed = getaddresses(msg.get_all("bcc", []))

    addresses_json = {
        "fromAddress": from_parsed[0][1] if from_parsed else "",
        "toAddresses": [a[1] for a in to_parsed],
        "ccAddresses": [a[1] for a in cc_parsed],
        "ccoAddresses": [a[1] for a in bcc_parsed],  # BCC
    }

    with open(f"{base}.addresses.json", "w", encoding="utf-8") as f:
        json.dump(addresses_json, f, indent=4)

    print("Done.")


if __name__ == "__main__":
    main()