picture_as_pdf
code
Python
verified
Free Download
devices
Cross-platform
code Code Preview
Python#!/usr/bin/env python3
"""
Email Extractor for PDF Files
Supports digital PDFs and OCR for scanned documents
"""
import re
from pathlib import Path
try:
import pdfplumber
except ImportError:
pdfplumber = None
EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
def extract_from_digital_pdf(pdf_path):
"""Extract emails from text-based PDF"""
if not pdfplumber:
raise ImportError("Install pdfplumber: pip install pdfplumber")
emails = set()
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
text = page.extract_text()
if text:
found = re.findall(EMAIL_PATTERN, text)
emails.update(e.lower() for e in found)
return sorted(emails)
def extract_with_ocr(pdf_path):
"""Extract emails from scanned PDF using OCR"""
try:
import pytesseract
from pdf2image import convert_from_path
except ImportError:
raise ImportError("Install: pip install pytesseract pdf2image")
emails = set()
images = convert_from_path(pdf_path)
for img in images:
text = pytesseract.image_to_string(img)
found = re.findall(EMAIL_PATTERN, text)
emails.update(e.lower() for e in found)
return sorted(emails)
def extract_emails(pdf_path, use_ocr=False):
"""Main extraction function"""
if use_ocr:
return extract_with_ocr(pdf_path)
return extract_from_digital_pdf(pdf_path)
def batch_extract(folder_path, use_ocr=False):
"""Extract emails from all PDFs in folder"""
all_emails = set()
folder = Path(folder_path)
for pdf_file in folder.glob('*.pdf'):
try:
emails = extract_emails(pdf_file, use_ocr)
all_emails.update(emails)
print(f"{pdf_file.name}: {len(emails)} emails")
except Exception as e:
print(f"{pdf_file.name}: Error - {e}")
return sorted(all_emails)
if __name__ == '__main__':
import sys
pdf_path = sys.argv[1]
use_ocr = '--ocr' in sys.argv
emails = extract_emails(pdf_path, use_ocr)
print(f"Found {len(emails)} unique emails")
for email in emails:
print(email)
info About This Tool
The PDF Email Extractor parses PDF files to find email addresses. Supports both digital PDFs and scanned documents with OCR capability.
Key Features
- Digital PDFs - Fast extraction using pdfplumber (5-10 pages/sec)
- OCR Support - Tesseract OCR for scanned documents
- Multi-Page - Handles documents with 100+ pages
- Table Detection - Extracts from complex layouts
- Batch Processing - Process entire folders
- Password Support - Handles encrypted PDFs
Supported PDF Types
- Text-based PDFs (Word, Excel exports)
- Scanned documents (with OCR)
- Business cards and invoices
- Multi-column layouts
Requirements
- Python 3.7+
- pdfplumber (
pip install pdfplumber) - For OCR: pytesseract, pdf2image, Tesseract engine
Performance: Digital PDFs: 5-10 pages/sec. OCR: ~1 page/sec. Accuracy: 85-95% for clear scans.
download Download Script
Need Full Automation?
Try Postigo for automated email campaigns with AI personalization
rocket_launch Start Free Trial