Add extract_email.py

This commit is contained in:
git 2025-11-22 13:18:13 +00:00
commit c7ac1ea6a2

112
extract_email.py Normal file
View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
import sys
import os
import json
import email
from email import policy
from email.parser import BytesParser
from email.utils import getaddresses
def _decode_text_part(part):
    """Return the text of a MIME part as ``str``, or ``None`` if it has no payload.

    The transfer encoding (base64, quoted-printable, ...) is undone first, then
    the bytes are decoded with the charset declared on the part.  UTF-8 is used
    when no charset is declared or the declared one is unknown to Python.
    Undecodable bytes are dropped rather than raising.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        # Container parts (multipart/*, message/*) have no single payload.
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except (LookupError, ValueError):
        # Bogus or non-text charset label: fall back instead of failing.
        return payload.decode("utf-8", errors="ignore")


def _extract_body_text(msg):
    """Return the first inline ``text/plain`` body of *msg*, or ``None``.

    For multipart messages the tree is walked in document order; parts that
    are explicitly marked as attachments are skipped so an attached ``.txt``
    file is not mistaken for the message body.
    """
    if not msg.is_multipart():
        if msg.get_content_type() == "text/plain":
            return _decode_text_part(msg)
        return None

    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        if part.get_content_disposition() == "attachment":
            continue
        return _decode_text_part(part)
    return None


def _attachment_bytes(part):
    """Return the raw bytes to store for an attachment part.

    Leaf parts yield their transfer-decoded payload.  Container parts (for
    example an attached ``message/rfc822``) have no decodable payload, so
    their child messages are serialized instead of writing ``None``.
    """
    data = part.get_payload(decode=True)
    if data is not None:
        return data
    children = part.get_payload()
    if isinstance(children, list):
        return b"".join(child.as_bytes() for child in children)
    return b""


def _write_attachments(msg, base):
    """Write every part of *msg* with an ``attachment`` disposition to disk.

    Files are named ``<base>.attachmentNN<ext>`` where ``NN`` is a 1-based
    counter and ``<ext>`` is the extension of the attachment's own filename
    (``.bin`` when the part carries no filename).
    """
    attachment_counter = 1
    for part in msg.walk():
        if part.get_content_disposition() != "attachment":
            continue
        filename = part.get_filename() or f"attachment{attachment_counter:02d}.bin"
        # Only the extension of the sender-supplied name is reused.
        _, ext = os.path.splitext(filename)
        outname = f"{base}.attachment{attachment_counter:02d}{ext}"
        with open(outname, "wb") as f:
            f.write(_attachment_bytes(part))
        attachment_counter += 1


def _collect_addresses(msg):
    """Return the sender and recipient addresses of *msg* as a JSON-ready dict."""
    from_parsed = getaddresses([msg.get("from", "")])
    to_parsed = getaddresses(msg.get_all("to", []))
    cc_parsed = getaddresses(msg.get_all("cc", []))
    bcc_parsed = getaddresses(msg.get_all("bcc", []))
    return {
        "fromAddress": from_parsed[0][1] if from_parsed else "",
        "toAddresses": [addr for _, addr in to_parsed],
        "ccAddresses": [addr for _, addr in cc_parsed],
        "ccoAddresses": [addr for _, addr in bcc_parsed],  # BCC
    }


def main():
    """Split the ``.eml`` file named on the command line into separate files.

    With ``base`` being the input path minus its extension, this writes:

    * ``<base>.subject.txt``     - the Subject header, stripped
    * ``<base>.txt``             - the first inline text/plain body, or a
      placeholder line when none exists
    * ``<base>.attachmentNN.*``  - one file per attachment
    * ``<base>.addresses.json``  - From / To / Cc / Bcc addresses

    Exits with status 1 when no argument is given or the file does not exist.
    """
    if len(sys.argv) < 2:
        # Report the name the script was actually invoked as.
        prog = os.path.basename(sys.argv[0]) if sys.argv and sys.argv[0] else "extract_email.py"
        print(f"Usage: {prog} <file.eml>", file=sys.stderr)
        sys.exit(1)

    eml_file = sys.argv[1]
    if not os.path.isfile(eml_file):
        print(f"Error: file not found: {eml_file}", file=sys.stderr)
        sys.exit(1)

    base = os.path.splitext(eml_file)[0]

    # Read the email
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # ---------------------------
    # Extract Subject
    # ---------------------------
    subject = msg.get("subject", "").strip()
    with open(f"{base}.subject.txt", "w", encoding="utf-8", errors="ignore") as f:
        f.write(subject)

    # ---------------------------
    # Extract Text Body
    # ---------------------------
    body_text = _extract_body_text(msg)
    if body_text is None:
        body_text = "(No text/plain body found)"
    with open(f"{base}.txt", "w", encoding="utf-8", errors="ignore") as f:
        f.write(body_text)

    # ---------------------------
    # Extract Attachments
    # ---------------------------
    _write_attachments(msg, base)

    # ---------------------------
    # Extract Addresses
    # ---------------------------
    addresses_json = _collect_addresses(msg)
    with open(f"{base}.addresses.json", "w", encoding="utf-8") as f:
        json.dump(addresses_json, f, indent=4)

    print("Done.")
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()