LangChain 邮件处理指南:EML 文件解析与加密
本文将全面介绍:
- EML 文件格式和结构解析
- 使用 LangChain 处理邮件内容
- 邮件加密原理和实现方法
- 实际应用场景和最佳实践

适合想要了解邮件处理和 AI 应用集成的开发者阅读。
LangChain 邮件处理指南:EML 文件解析与加密
目录
1. EML 文件基础
1.1 EML 文件结构
EML(Electronic Mail Format)是标准的电子邮件格式,基于 MIME(Multipurpose Internet Mail Extensions)标准:
From: sender@example.com
To: recipient@example.com
Subject: Email Subject
Date: Tue, 9 Jan 2024 10:00:00 +0800
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="boundary_string"

--boundary_string
Content-Type: text/plain; charset="utf-8"

邮件正文内容
--boundary_string
Content-Type: application/pdf
Content-Disposition: attachment; filename="document.pdf"
Content-Transfer-Encoding: base64

[Base64 编码的附件内容]
--boundary_string--
1.2 MIME 类型
常见的 MIME 类型:
# Lookup table from common MIME content types to the Chinese display label
# used for each of them.
MIME_TYPES = {
    # Textual bodies
    'text/plain': '纯文本',
    'text/html': 'HTML',
    # Multipart containers
    'multipart/mixed': '混合内容',
    'multipart/alternative': '替代内容',
    # Typical attachment payloads
    'application/pdf': 'PDF文件',
    'image/jpeg': 'JPEG图片',
}
2. LangChain 邮件处理
2.1 环境准备
# Packages listed by the original guide
pip install langchain
pip install email-validator
pip install python-magic
pip install beautifulsoup4
# Packages imported by the snippets in this guide but missing from the list above
pip install unstructured      # backend used by UnstructuredEmailLoader
pip install openai tiktoken   # needed by ChatOpenAI / OpenAIEmbeddings
pip install chromadb          # backend of the Chroma vector store
pip install cryptography      # x509 / hashes / padding used in section 3.1
pip install PGPy              # provides the `pgpy` module used in section 3.2
2.2 基本邮件解析
from langchain.document_loaders import UnstructuredEmailLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
# Create the e-mail loader
def load_email(eml_path):
    """Load an ``.eml`` file through ``UnstructuredEmailLoader``.

    Args:
        eml_path: Path of the ``.eml`` file passed to the loader.

    Returns:
        The value returned by the loader's ``load()`` call.
    """
    return UnstructuredEmailLoader(eml_path).load()
# Text splitting
def split_email_content(documents):
    """Split loaded e-mail documents into overlapping chunks.

    Args:
        documents: Documents to split, e.g. the result of ``load_email``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` with
        ``chunk_size=1000`` and ``chunk_overlap=200``.
    """
    splitter_options = {'chunk_size': 1000, 'chunk_overlap': 200}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)
2.3 使用 LangChain 分析邮件
# Create the e-mail analysis chain
def create_email_analysis_chain():
    """Build the ``LLMChain`` that extracts key information from e-mail text.

    The prompt has a single input variable, ``email_content``, and asks (in
    Chinese) for the main topic, key points, required actions and a priority
    assessment.

    Returns:
        An ``LLMChain`` combining that prompt with ``ChatOpenAI(temperature=0)``.
    """
    # Spelled out line by line so the prompt text does not depend on how the
    # surrounding code is indented.
    analysis_template = (
        "\n"
        "分析以下邮件内容,提取关键信息:\n"
        "{email_content}\n"
        "请提供:\n"
        "1. 主要主题\n"
        "2. 关键点\n"
        "3. 需要采取的行动\n"
        "4. 优先级评估\n"
    )
    analysis_prompt = PromptTemplate(
        input_variables=["email_content"],
        template=analysis_template,
    )
    return LLMChain(llm=ChatOpenAI(temperature=0), prompt=analysis_prompt)
# Complete e-mail processing pipeline
def process_email(eml_path):
    """Load, split and analyse one ``.eml`` file.

    Args:
        eml_path: Path of the ``.eml`` file to process.

    Returns:
        A list with one analysis result per chunk, in chunk order. Each chunk
        is analysed on its own with the chain from
        ``create_email_analysis_chain``.
    """
    chunks = split_email_content(load_email(eml_path))
    chain = create_email_analysis_chain()
    return [chain.run(email_content=chunk.page_content) for chunk in chunks]
2.4 处理邮件附件
from email import message_from_file
import base64
import os
def extract_attachments(eml_path, output_dir):
    """Save every named attachment of an ``.eml`` file into ``output_dir``.

    The message is parsed from bytes, ``output_dir`` is created if needed,
    and each non-multipart part that carries a filename is written with its
    transfer encoding decoded.

    Args:
        eml_path: Path of the ``.eml`` file to read.
        output_dir: Directory that receives the attachments; missing parent
            directories are created.

    Returns:
        None. Attachments are written to disk as a side effect. A later
        attachment with the same file name overwrites an earlier one.
    """
    # Local import keeps this block self-contained: the file-level import
    # only provides the text-mode ``message_from_file``.
    from email import message_from_binary_file

    # Parse from bytes. Text mode decodes with the platform default encoding,
    # which can fail on, or silently corrupt, 8-bit message content.
    with open(eml_path, 'rb') as eml_file:
        msg = message_from_binary_file(eml_file)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        filename = part.get_filename()
        if not filename:
            continue

        # The name comes from the message itself, i.e. from untrusted input.
        # Keep only its last path component so a name such as "../x" or
        # "C:\\dir\\x" cannot place the file outside output_dir.
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            # Container parts (e.g. an attached message) have no single
            # decodable payload; their sub-parts are visited by walk().
            continue

        with open(os.path.join(output_dir, safe_name), 'wb') as out_file:
            out_file.write(payload)
3. 邮件加密技术
3.1 S/MIME 加密
S/MIME(Secure/Multipurpose Internet Mail Extensions)是电子邮件加密的标准方式。注意:下面的示例仅演示如何用接收者证书中的公钥(OAEP 填充)直接加密一段较短的消息,并不会生成完整的 S/MIME(CMS)加密邮件:
from cryptography import x509
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding
def encrypt_email_smime(message, recipient_cert):
    """Encrypt ``message`` with the public key of a PEM certificate.

    The text is encoded with ``str.encode()`` and passed to the certificate's
    public key ``encrypt`` method using OAEP padding (MGF1 and SHA-256, no
    label).

    NOTE(review): the return value is the raw output of that ``encrypt``
    call; no S/MIME / CMS message structure is built around it here.
    NOTE(review): OAEP presumably limits the plaintext to less than the key
    size, so long messages are expected to fail - confirm against the
    ``cryptography`` documentation.

    Args:
        message: Text to encrypt.
        recipient_cert: Path of the recipient's PEM-encoded X.509 certificate.

    Returns:
        The ciphertext returned by the public key's ``encrypt`` call.
    """
    # Read the recipient certificate
    with open(recipient_cert, 'rb') as cert_file:
        pem_bytes = cert_file.read()
    public_key = x509.load_pem_x509_certificate(pem_bytes).public_key()

    # Encrypt the message content
    plaintext = message.encode()
    sha256_oaep = padding.OAEP(
        mgf=padding.MGF1(algorithm=hashes.SHA256()),
        algorithm=hashes.SHA256(),
        label=None,
    )
    return public_key.encrypt(plaintext, sha256_oaep)
3.2 PGP 加密
PGP(Pretty Good Privacy)是另一种常用的邮件加密方式:
from pgpy import PGPMessage, PGPKey
def encrypt_email_pgp(message, recipient_key_path):
    """Encrypt ``message`` for a recipient with their PGP key.

    Args:
        message: Content wrapped into a new ``PGPMessage``.
        recipient_key_path: Path of the key file loaded with
            ``PGPKey.from_file``.

    Returns:
        The result of encrypting the new ``PGPMessage`` with the loaded key.
    """
    # from_file returns a pair; only its first element (the key) is used.
    recipient_key, _unused = PGPKey.from_file(recipient_key_path)
    return recipient_key.encrypt(PGPMessage.new(message))
3.3 TLS 传输加密
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
def send_encrypted_email(sender, recipient, subject, body, *,
                         password='your-app-password',
                         smtp_host='smtp.gmail.com', smtp_port=587):
    """Send a plain-text e-mail over an SMTP connection upgraded with STARTTLS.

    Only the transport is encrypted; the message itself is sent as ordinary
    MIME text.

    Args:
        sender: Address used as the ``From`` header and as the SMTP login name.
        recipient: Address used as the ``To`` header.
        subject: Subject line.
        body: Plain-text body.
        password: SMTP password for ``sender``. The default is the
            placeholder from the original example and must be replaced with
            a real credential.
        smtp_host: SMTP server host name.
        smtp_port: SMTP server port.

    Returns:
        None.
    """
    # Local import keeps this block self-contained.
    import ssl

    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = recipient
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    # Pass an explicit default context so the server certificate and host
    # name are verified; starttls() without a context does not guarantee it.
    tls_context = ssl.create_default_context()

    # Upgrade the connection to TLS before any credentials are sent
    with smtplib.SMTP(smtp_host, smtp_port) as server:
        server.starttls(context=tls_context)
        server.login(sender, password)
        server.send_message(msg)
4. 实践应用
4.1 邮件自动分类系统
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
class EmailClassifier:
    """Classify e-mail text by nearest-neighbour lookup in a vector store.

    Nothing in this class adds documents to the store; categorised documents
    must be added to ``self.db`` elsewhere before ``classify_email`` can
    return a result.
    """

    def __init__(self):
        """Create the embedding model and the Chroma store that uses it."""
        embedder = OpenAIEmbeddings()
        self.embeddings = embedder
        self.db = Chroma(embedding_function=embedder)

    def classify_email(self, email_content):
        """Return the category of the stored document closest to the e-mail.

        Args:
            email_content: E-mail text to classify.

        Returns:
            The ``'category'`` metadata value of the single nearest document.

        Raises:
            IndexError: If the similarity search returns no documents.
            KeyError: If the nearest document has no ``'category'`` metadata.
        """
        # Embed the e-mail text
        query_vector = self.embeddings.embed_query(email_content)
        # Look up the single most similar stored document
        matches = self.db.similarity_search_by_vector(query_vector, k=1)
        best_match = matches[0]
        return best_match.metadata['category']
4.2 智能邮件回复生成
def generate_email_reply(email_content):
    """Generate a reply to an e-mail with an LLM.

    The prompt (in Chinese) asks for a reply that stays professional,
    responds to all key points and states the next steps.

    Args:
        email_content: Text of the e-mail to answer.

    Returns:
        The result of running the chain on ``email_content``, using
        ``ChatOpenAI(temperature=0.7)``.
    """
    # Spelled out line by line so the prompt text does not depend on how the
    # surrounding code is indented.
    reply_template = (
        "\n"
        "基于以下邮件内容生成适当的回复:\n"
        "{email_content}\n"
        "回复要求:\n"
        "1. 保持专业性\n"
        "2. 回应所有关键点\n"
        "3. 明确下一步行动\n"
    )
    reply_prompt = PromptTemplate(
        input_variables=["email_content"],
        template=reply_template,
    )
    reply_chain = LLMChain(llm=ChatOpenAI(temperature=0.7), prompt=reply_prompt)
    return reply_chain.run(email_content=email_content)
4.3 敏感信息检测
def detect_sensitive_info(email_content):
    """Ask an LLM whether e-mail text contains sensitive information.

    The prompt (in Chinese) lists four kinds of information to check:
    personal identity data, financial data, passwords and access
    credentials, and trade secrets.

    Args:
        email_content: Text to inspect; passed to the prompt as ``content``.

    Returns:
        The result of running the chain, using ``ChatOpenAI(temperature=0)``.
    """
    # Spelled out line by line so the prompt text does not depend on how the
    # surrounding code is indented.
    detection_template = (
        "\n"
        "检查以下内容是否包含敏感信息:\n"
        "{content}\n"
        "需要检查的信息类型:\n"
        "1. 个人身份信息\n"
        "2. 财务信息\n"
        "3. 密码和访问凭证\n"
        "4. 商业机密\n"
    )
    detection_prompt = PromptTemplate(
        input_variables=["content"],
        template=detection_template,
    )
    detector = LLMChain(llm=ChatOpenAI(temperature=0), prompt=detection_prompt)
    return detector.run(content=email_content)
5. 最佳实践
5.1 安全性建议
- 始终使用最新的加密标准
- 定期更新加密密钥
- 实施访问控制
- 加密存储敏感数据
- 使用安全的传输协议
5.2 性能优化
# Process e-mails in a batch
async def process_emails_batch(eml_files):
    """Process several ``.eml`` files concurrently.

    NOTE(review): ``process_email_async`` is not defined in this section; it
    is assumed to be a coroutine function taking one file path - confirm
    where it is defined.

    Args:
        eml_files: Iterable of ``.eml`` file paths.

    Returns:
        A list with one result per input file, in input order. An empty
        input yields an empty list.
    """
    # Local import: this snippet uses asyncio without importing it.
    import asyncio

    # gather() schedules every coroutine as a task and preserves input order.
    return await asyncio.gather(
        *(process_email_async(eml_file) for eml_file in eml_files)
    )
5.3 错误处理
class EmailProcessingError(Exception):
    """Raised by ``safe_process_email`` when any processing step fails."""


def safe_process_email(eml_path):
    """Load, split and process an ``.eml`` file, wrapping any failure.

    NOTE(review): ``process_splits`` is not defined in this section; it is
    assumed to take the list of chunks and return the results - confirm
    where it is defined.

    Args:
        eml_path: Path of the ``.eml`` file to process.

    Returns:
        Whatever ``process_splits`` returns for the file's chunks.

    Raises:
        EmailProcessingError: If any step raises an ``Exception``. The
            original exception is attached as ``__cause__``.
    """
    try:
        documents = load_email(eml_path)
        splits = split_email_content(documents)
        return process_splits(splits)
    except Exception as e:
        # "from e" records the original failure as the explicit cause, so the
        # traceback shows which step actually failed.
        raise EmailProcessingError(f"处理邮件时出错: {e!s}") from e
总结
LangChain 为邮件处理提供了强大的功能:
- 基础功能:
  - 邮件解析和加载
  - 内容分割和处理
  - 附件处理
- AI 增强功能:
  - 内容分析和摘要
  - 智能分类
  - 自动回复生成
- 安全考虑:
  - 加密方案选择
  - 密钥管理
  - 传输安全
评论