The foundation of the PDF-to-Audio Reader is its ability to intelligently extract text from PDFs, optimize content for speech synthesis, and generate natural-sounding audio narration with proper pacing and pronunciation:
import io
import logging
import re
from typing import Dict, List, Optional, Tuple

import PyPDF2
import streamlit as st
from gtts import gTTS
from pydub import AudioSegment
class PDFAudioProcessor:
    """
    PDF-to-audio conversion engine.

    Reads page text from a PDF with PyPDF2, rewrites it into speech-friendly
    segments, synthesizes each segment with gTTS, and joins the resulting
    clips with pydub into a single encoded audio file.
    """
def __init__(self):
self.voice_settings = {
'default': {'lang': 'en', 'slow': False, 'tld': 'com'},
'british': {'lang': 'en', 'slow': False, 'tld': 'co.uk'},
'australian': {'lang': 'en', 'slow': False, 'tld': 'com.au'},
'slow': {'lang': 'en', 'slow': True, 'tld': 'com'}
}
self.text_patterns = {
'abbreviations': {
'Dr.': 'Doctor', 'Mr.': 'Mister', 'Mrs.': 'Missus', 'Ms.': 'Miss',
'Prof.': 'Professor', 'etc.': 'etcetera', 'vs.': 'versus'
},
'numbers': {
'1st': 'first', '2nd': 'second', '3rd': 'third',
'&': 'and', '@': 'at', '%': 'percent'
}
}
self.audio_settings = {
'sample_rate': 22050,
'channels': 1,
'format': 'mp3',
'bitrate': '128k',
'silence_duration': 0.5
}
def convert_pdf_to_audio(self, pdf_file, voice_type: str = 'default',
                         reading_speed: str = 'normal') -> Tuple[bytes, Dict]:
    """
    Run the whole pipeline: PDF in, encoded audio plus metadata out.

    The stages run strictly in this order: structured text extraction,
    speech optimization, per-segment synthesis, assembly into one file,
    and metadata generation.

    Args:
        pdf_file: PDF file object handed to the extraction stage.
        voice_type: Name of an entry in ``self.voice_settings``.
        reading_speed: Speed preference forwarded to the synthesis stage.
    Returns:
        Tuple of (audio_bytes, processing_metadata).
    """
    document = self._extract_structured_text(pdf_file)
    speech_ready = self._optimize_text_for_speech(document)
    audio_bytes = self._assemble_final_audiobook(
        self._generate_audio_segments(speech_ready, voice_type, reading_speed)
    )
    # The metadata stage sees both text representations and the final bytes.
    return audio_bytes, self._generate_audio_metadata(document, speech_ready, audio_bytes)
def _extract_structured_text(self, pdf_file) -> Dict:
    """
    Read every page of a PDF and group the cleaned text into chapters.

    Each page's text is cleaned with ``_clean_extracted_text``. A page for
    which ``_is_chapter_break`` is true closes the chapter collected so far
    and starts a new one; any other page is appended to the current chapter.
    Text that precedes the first chapter break is stored as "Chapter 0".

    Args:
        pdf_file: PDF file object accepted by ``PyPDF2.PdfReader``.
    Returns:
        Dict with keys 'full_text' (all pages, stripped), 'chapters'
        (list of dicts with 'title', 'content', 'word_count'), 'sections'
        (always empty here), 'page_count', 'estimated_reading_time' and
        'total_words'.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pages = pdf_reader.pages

    chapters = []
    chapter_count = 0
    # Both accumulators start with '' so that ' '.join(...) puts one space
    # in front of every appended page, exactly as repeated `+= ' ' + text`
    # would, without re-copying the growing string on each page.
    chapter_parts = ['']
    page_texts = ['']

    def close_chapter():
        """Store the chapter collected so far, unless it has no text."""
        content = ' '.join(chapter_parts)
        if content:
            chapters.append({
                'title': f"Chapter {chapter_count}",
                'content': content,
                'word_count': len(content.split()),
            })

    for page in pages:
        # Defensive: treat a page that yields no text as an empty string so
        # the cleaning helper always receives a str.
        cleaned_text = self._clean_extracted_text(page.extract_text() or '')
        if self._is_chapter_break(cleaned_text):
            close_chapter()
            # NOTE(review): the counter advances on every chapter break,
            # even when there was no earlier text to close - confirm this
            # matches the intended chapter numbering.
            chapter_count += 1
            chapter_parts = [cleaned_text]
        else:
            chapter_parts.append(cleaned_text)
        page_texts.append(cleaned_text)
    # The last chapter has no following break to close it.
    close_chapter()

    full_text = ' '.join(page_texts)
    total_words = len(full_text.split())
    return {
        'full_text': full_text.strip(),
        'chapters': chapters,
        'sections': [],
        'page_count': len(pages),
        'estimated_reading_time': self._estimate_reading_time(full_text),
        'total_words': total_words,
    }
def _optimize_text_for_speech(self, content: Dict) -> Dict:
"""
Optimize extracted text for natural speech synthesis.
Handles pronunciation, pacing, and readability improvements.
"""
optimized_content = {
'chapters': [],
'total_segments': 0,
'processing_notes': []
}
for chapter in content['chapters']:
chapter_text = chapter['content']
expanded_text = self._expand_abbreviations(chapter_text)
normalized_text = self._normalize_numbers_and_symbols(expanded_text)
paced_text = self._add_natural_pacing(normalized_text)
speech_segments = self._create_speech_segments(paced_text)
optimized_content['chapters'].append({
'title': chapter['title'],
'segments': speech_segments,
'segment_count': len(speech_segments),
'optimization_applied': ['abbreviation_expansion', 'number_normalization', 'pacing_optimization']
})
optimized_content['total_segments'] += len(speech_segments)
return optimized_content
def _generate_audio_segments(self, optimized_content: Dict,
voice_type: str, reading_speed: str) -> List[AudioSegment]:
"""
Generate high-quality audio segments using advanced text-to-speech.
Applies voice characteristics and reading speed optimizations.
"""
voice_config = self.voice_settings.get(voice_type, self.voice_settings['default'])
if reading_speed == 'slow':
voice_config['slow'] = True
elif reading_speed == 'fast':
voice_config['slow'] = False
audio_segments = []
for chapter in optimized_content['chapters']:
title_audio = self._generate_title_audio(chapter['title'], voice_config)
audio_segments.append(title_audio)
pause = AudioSegment.silent(duration=1000)
audio_segments.append(pause)
for segment in chapter['segments']:
try:
tts = gTTS(
text=segment['text'],
lang=voice_config['lang'],
slow=voice_config['slow'],
tld=voice_config['tld']
)
audio_buffer = io.BytesIO()
tts.write_to_fp(audio_buffer)
audio_buffer.seek(0)
segment_audio = AudioSegment.from_mp3(audio_buffer)
enhanced_audio = self._enhance_audio_quality(segment_audio)
audio_segments.append(enhanced_audio)
if segment.get('pause_after', True):
pause_duration = self._calculate_pause_duration(segment)
pause = AudioSegment.silent(duration=pause_duration)
audio_segments.append(pause)
except Exception as e:
continue
return audio_segments
def _enhance_audio_quality(self, audio: AudioSegment) -> AudioSegment:
"""
Apply audio enhancements for professional audiobook quality.
Includes normalization, noise reduction, and dynamic range optimization.
"""
normalized_audio = audio.normalize()
compressed_audio = normalized_audio.compress_dynamic_range(threshold=-20.0, ratio=2.0)
enhanced_audio = compressed_audio.high_pass_filter(80).low_pass_filter(8000)
final_audio = enhanced_audio.fade_in(100).fade_out(100)
return final_audio
def _assemble_final_audiobook(self, audio_segments: List[AudioSegment]) -> bytes:
    """
    Join the clips in order, master the result and encode it to bytes.

    The clips are concatenated onto an empty AudioSegment, passed through
    ``_apply_final_mastering``, and exported in memory using the 'format',
    'bitrate' and 'channels' entries of ``self.audio_settings``.

    Args:
        audio_segments: Clips in playback order.
    Returns:
        The encoded audio file as bytes.
    """
    # Starting from an empty segment makes this a left-to-right append.
    combined = sum(audio_segments, AudioSegment.empty())
    mastered = self._apply_final_mastering(combined)

    settings = self.audio_settings
    encoded = io.BytesIO()
    mastered.export(
        encoded,
        format=settings['format'],
        bitrate=settings['bitrate'],
        # Extra encoder arguments: "-ac" carries the channel count.
        parameters=["-ac", str(settings['channels'])]
    )
    return encoded.getvalue()