🔊 PDF-to-Audio Reader

Core Source Code & Audio Narration Implementation

Python 3.9 · Streamlit · Text-to-Speech · Audio Processing

🔍 About This Code Showcase

This curated code showcase demonstrates how the PDF-to-Audio Reader extracts text from documents, prepares the content for narration, and generates high-quality audio with natural-sounding voice synthesis.

Full deployment scripts, API integrations, and proprietary details are omitted for clarity and security. This showcase highlights the core text processing and audio generation algorithms.

📖 Core Algorithm: Document-to-Audio Engine

The foundation of the PDF-to-Audio Reader is its ability to intelligently extract text from PDFs, optimize content for speech synthesis, and generate natural-sounding audio narration with proper pacing and pronunciation:

📄 pdf_audio_processor.py
import io
import re
from typing import Dict, List, Optional, Tuple

import PyPDF2
import streamlit as st
from gtts import gTTS
from pydub import AudioSegment


class PDFAudioProcessor:
    """
    Advanced PDF-to-audio conversion engine that extracts text intelligently,
    optimizes content for speech synthesis, and generates high-quality
    audio narration.
    """

    def __init__(self):
        # Voice and language settings for natural narration
        self.voice_settings = {
            'default': {'lang': 'en', 'slow': False, 'tld': 'com'},
            'british': {'lang': 'en', 'slow': False, 'tld': 'co.uk'},
            'australian': {'lang': 'en', 'slow': False, 'tld': 'com.au'},
            'slow': {'lang': 'en', 'slow': True, 'tld': 'com'}
        }

        # Text preprocessing patterns for better audio quality
        self.text_patterns = {
            'abbreviations': {
                'Dr.': 'Doctor', 'Mr.': 'Mister', 'Mrs.': 'Missus',
                'Ms.': 'Miss', 'Prof.': 'Professor', 'etc.': 'etcetera',
                'vs.': 'versus'
            },
            'numbers': {
                '1st': 'first', '2nd': 'second', '3rd': 'third',
                '&': 'and', '@': 'at', '%': 'percent'
            }
        }

        # Audio processing settings for optimal playback
        self.audio_settings = {
            'sample_rate': 22050,
            'channels': 1,            # Mono for voice
            'format': 'mp3',
            'bitrate': '128k',
            'silence_duration': 0.5   # Seconds between sections
        }

    def convert_pdf_to_audio(self, pdf_file, voice_type: str = 'default',
                             reading_speed: str = 'normal') -> Tuple[bytes, Dict]:
        """
        Convert PDF document to high-quality audio narration.
        Handles text extraction, optimization, and natural voice synthesis.

        Args:
            pdf_file: Uploaded PDF file object
            voice_type: Voice accent/style selection
            reading_speed: Narration speed preference

        Returns:
            Tuple of (audio_bytes, processing_metadata)
        """
        # Step 1: Extract and structure text from PDF
        extracted_content = self._extract_structured_text(pdf_file)

        # Step 2: Optimize text for speech synthesis
        optimized_text = self._optimize_text_for_speech(extracted_content)

        # Step 3: Generate audio segments with proper pacing
        audio_segments = self._generate_audio_segments(optimized_text, voice_type, reading_speed)

        # Step 4: Combine segments into final audiobook
        final_audio = self._assemble_final_audiobook(audio_segments)

        # Step 5: Generate processing metadata
        metadata = self._generate_audio_metadata(extracted_content, optimized_text, final_audio)

        return final_audio, metadata

    def _extract_structured_text(self, pdf_file) -> Dict:
        """
        Extract text from PDF while preserving document structure.
        Identifies chapters, sections, paragraphs for optimal audio pacing.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        structured_content = {
            'full_text': '',
            'chapters': [],
            'sections': [],
            'page_count': len(pdf_reader.pages),
            'estimated_reading_time': 0
        }

        full_text = ''
        current_chapter = ''
        chapter_count = 0

        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()

            # Clean and normalize text
            cleaned_text = self._clean_extracted_text(page_text)

            # Detect chapter breaks using common patterns
            if self._is_chapter_break(cleaned_text):
                if current_chapter:
                    structured_content['chapters'].append({
                        'title': f"Chapter {chapter_count}",
                        'content': current_chapter,
                        'word_count': len(current_chapter.split())
                    })
                chapter_count += 1
                current_chapter = cleaned_text
            else:
                current_chapter += ' ' + cleaned_text

            full_text += ' ' + cleaned_text

        # Add final chapter
        if current_chapter:
            structured_content['chapters'].append({
                'title': f"Chapter {chapter_count}",
                'content': current_chapter,
                'word_count': len(current_chapter.split())
            })

        structured_content['full_text'] = full_text.strip()
        structured_content['total_words'] = len(full_text.split())
        structured_content['estimated_reading_time'] = self._estimate_reading_time(full_text)

        return structured_content

    def _optimize_text_for_speech(self, content: Dict) -> Dict:
        """
        Optimize extracted text for natural speech synthesis.
        Handles pronunciation, pacing, and readability improvements.
        """
        optimized_content = {
            'chapters': [],
            'total_segments': 0,
            'processing_notes': []
        }

        for chapter in content['chapters']:
            chapter_text = chapter['content']

            # Step 1: Expand abbreviations for better pronunciation
            expanded_text = self._expand_abbreviations(chapter_text)

            # Step 2: Normalize numbers and symbols
            normalized_text = self._normalize_numbers_and_symbols(expanded_text)

            # Step 3: Add natural pauses and emphasis
            paced_text = self._add_natural_pacing(normalized_text)

            # Step 4: Break into optimal speech segments
            speech_segments = self._create_speech_segments(paced_text)

            optimized_content['chapters'].append({
                'title': chapter['title'],
                'segments': speech_segments,
                'segment_count': len(speech_segments),
                'optimization_applied': ['abbreviation_expansion',
                                         'number_normalization',
                                         'pacing_optimization']
            })
            optimized_content['total_segments'] += len(speech_segments)

        return optimized_content

    def _generate_audio_segments(self, optimized_content: Dict, voice_type: str,
                                 reading_speed: str) -> List[AudioSegment]:
        """
        Generate high-quality audio segments using advanced text-to-speech.
        Applies voice characteristics and reading speed optimizations.
        """
        # Copy the preset so speed adjustments don't mutate the shared settings
        voice_config = dict(self.voice_settings.get(voice_type, self.voice_settings['default']))

        # Adjust voice settings based on reading speed preference
        if reading_speed == 'slow':
            voice_config['slow'] = True
        elif reading_speed == 'fast':
            voice_config['slow'] = False

        audio_segments = []

        for chapter in optimized_content['chapters']:
            # Add chapter title announcement
            title_audio = self._generate_title_audio(chapter['title'], voice_config)
            audio_segments.append(title_audio)

            # Add pause after title
            pause = AudioSegment.silent(duration=1000)  # 1 second pause
            audio_segments.append(pause)

            # Process each text segment
            for segment in chapter['segments']:
                try:
                    # Generate audio for text segment
                    tts = gTTS(
                        text=segment['text'],
                        lang=voice_config['lang'],
                        slow=voice_config['slow'],
                        tld=voice_config['tld']
                    )

                    # Convert to audio segment
                    audio_buffer = io.BytesIO()
                    tts.write_to_fp(audio_buffer)
                    audio_buffer.seek(0)
                    segment_audio = AudioSegment.from_mp3(audio_buffer)

                    # Apply audio enhancements
                    enhanced_audio = self._enhance_audio_quality(segment_audio)
                    audio_segments.append(enhanced_audio)

                    # Add natural pause between segments
                    if segment.get('pause_after', True):
                        pause_duration = self._calculate_pause_duration(segment)
                        pause = AudioSegment.silent(duration=pause_duration)
                        audio_segments.append(pause)

                except Exception:
                    # Handle TTS errors gracefully and skip the failed segment
                    continue

        return audio_segments

    def _enhance_audio_quality(self, audio: AudioSegment) -> AudioSegment:
        """
        Apply audio enhancements for professional audiobook quality.
        Includes normalization, noise reduction, and dynamic range optimization.
        """
        # Normalize audio levels for consistent volume
        normalized_audio = audio.normalize()

        # Apply gentle compression for better dynamics
        compressed_audio = normalized_audio.compress_dynamic_range(threshold=-20.0, ratio=2.0)

        # Apply subtle EQ for voice clarity
        enhanced_audio = compressed_audio.high_pass_filter(80).low_pass_filter(8000)

        # Fade in/out for smooth transitions
        final_audio = enhanced_audio.fade_in(100).fade_out(100)

        return final_audio

    def _assemble_final_audiobook(self, audio_segments: List[AudioSegment]) -> bytes:
        """
        Combine all audio segments into final audiobook file.
        Applies professional mastering and export optimization.
        """
        # Combine all segments into single audio file
        final_audiobook = AudioSegment.empty()
        for segment in audio_segments:
            final_audiobook += segment

        # Apply final mastering
        mastered_audio = self._apply_final_mastering(final_audiobook)

        # Export to optimized format
        audio_buffer = io.BytesIO()
        mastered_audio.export(
            audio_buffer,
            format=self.audio_settings['format'],
            bitrate=self.audio_settings['bitrate'],
            parameters=["-ac", str(self.audio_settings['channels'])]
        )

        return audio_buffer.getvalue()

🎵 Advanced Audio Processing Engine

The audio processing system provides professional-grade audio enhancements, including dynamic range compression, voice clarity optimization, and seamless segment transitions:
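As a concrete example of the mastering math in the code below: an audiobook that measures -17 LUFS against the -23 LUFS broadcast target receives a -6 dB gain adjustment (target minus measured level), after which the gentle limiter keeps peaks below -1 dBFS.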

📄 audio_enhancement.py
import re
from typing import Dict, List

from pydub import AudioSegment


class AudioEnhancementEngine:
    """
    Speech-segmentation and mastering helpers used alongside PDFAudioProcessor.
    Methods such as _expand_abbreviations reference the processor's
    text_patterns configuration; other private helpers they call
    (_measure_loudness, _intelligent_sentence_split, etc.) are omitted
    from this showcase.
    """

    def _apply_final_mastering(self, audio: AudioSegment) -> AudioSegment:
        """
        Apply professional audio mastering for optimal listening experience.
        Includes loudness normalization and dynamic range optimization.
        """
        # Step 1: Normalize loudness to broadcast standards
        target_loudness = -23.0  # LUFS (Loudness Units relative to Full Scale)
        current_loudness = self._measure_loudness(audio)
        loudness_adjustment = target_loudness - current_loudness
        normalized_audio = audio + loudness_adjustment

        # Step 2: Apply gentle limiting to prevent clipping
        limited_audio = self._apply_gentle_limiter(normalized_audio, threshold=-1.0)

        # Step 3: Apply stereo width optimization for mono content
        optimized_audio = self._optimize_stereo_width(limited_audio)

        # Step 4: Apply final EQ for balanced frequency response
        final_audio = self._apply_mastering_eq(optimized_audio)

        return final_audio

    def _create_speech_segments(self, text: str) -> List[Dict]:
        """
        Break text into optimal segments for natural speech synthesis.
        Considers sentence structure, paragraph breaks, and reading flow.
        """
        segments = []

        # Split text into sentences while preserving structure
        sentences = self._intelligent_sentence_split(text)

        current_segment = ''
        segment_word_count = 0
        max_segment_words = 50  # Optimal for TTS processing

        for sentence in sentences:
            sentence_words = len(sentence.split())

            # Check if adding this sentence would exceed optimal segment length
            if segment_word_count + sentence_words > max_segment_words and current_segment:
                # Complete current segment
                segments.append({
                    'text': current_segment.strip(),
                    'word_count': segment_word_count,
                    'estimated_duration': self._estimate_speech_duration(current_segment),
                    'pause_after': self._should_pause_after_segment(current_segment)
                })

                # Start new segment
                current_segment = sentence
                segment_word_count = sentence_words
            else:
                # Add to current segment
                if current_segment:
                    current_segment += ' ' + sentence
                else:
                    current_segment = sentence
                segment_word_count += sentence_words

        # Add final segment
        if current_segment:
            segments.append({
                'text': current_segment.strip(),
                'word_count': segment_word_count,
                'estimated_duration': self._estimate_speech_duration(current_segment),
                'pause_after': True  # Always pause after final segment
            })

        return segments

    def _add_natural_pacing(self, text: str) -> str:
        """
        Add natural pacing cues to text for more human-like narration.
        Inserts strategic pauses and emphasis markers.
        """
        # Add longer pauses after paragraph breaks
        text = re.sub(r'\n\s*\n', '. \n\n ', text)

        # Add slight pauses after colons for lists or explanations
        text = re.sub(r':(\s+)', r':, \1', text)

        # Emphasize important transitional phrases
        emphasis_phrases = [
            'however', 'therefore', 'furthermore', 'in conclusion',
            'for example', 'on the other hand', 'in contrast'
        ]
        for phrase in emphasis_phrases:
            # Add subtle emphasis through comma placement (preserving original casing)
            pattern = r'\b' + re.escape(phrase) + r'\b'
            text = re.sub(pattern, lambda m: f", {m.group(0)},", text, flags=re.IGNORECASE)

        # Improve number reading by adding spaces
        text = re.sub(
            r'(\d{4,})',
            lambda m: ' '.join(m.group(1)[i:i + 3] for i in range(0, len(m.group(1)), 3)),
            text
        )

        return text

    def _expand_abbreviations(self, text: str) -> str:
        """
        Expand abbreviations and acronyms for proper pronunciation.
        Handles common abbreviations and context-specific expansions.
        """
        expanded_text = text

        # Expand common abbreviations
        for abbrev, expansion in self.text_patterns['abbreviations'].items():
            expanded_text = expanded_text.replace(abbrev, expansion)

        # Handle time abbreviations
        expanded_text = re.sub(
            r'\b(\d{1,2}):(\d{2})\s*(AM|PM)\b',
            lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}",
            expanded_text,
            flags=re.IGNORECASE
        )

        # Handle date abbreviations
        month_abbreviations = {
            'Jan': 'January', 'Feb': 'February', 'Mar': 'March',
            'Apr': 'April', 'May': 'May', 'Jun': 'June',
            'Jul': 'July', 'Aug': 'August', 'Sep': 'September',
            'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
        }
        for abbrev, full_name in month_abbreviations.items():
            # Match the abbreviation and consume a trailing period if present
            expanded_text = re.sub(r'\b' + abbrev + r'\b\.?', full_name, expanded_text)

        return expanded_text

    def _estimate_speech_duration(self, text: str) -> float:
        """
        Estimate speech duration for text segment.
        Based on average speaking rates and text complexity.
        """
        word_count = len(text.split())

        # Average words per minute for natural speech
        base_wpm = 150

        # Adjust for text complexity
        complexity_factor = self._calculate_text_complexity(text)
        adjusted_wpm = base_wpm * complexity_factor

        # Calculate duration in seconds
        duration_minutes = word_count / adjusted_wpm
        duration_seconds = duration_minutes * 60

        return duration_seconds

    def _calculate_text_complexity(self, text: str) -> float:
        """
        Calculate text complexity factor for speech duration estimation.
        Considers factors like punctuation density, word length, and technical terms.
        """
        # Count complex factors
        word_count = len(text.split())
        long_words = len([word for word in text.split() if len(word) > 6])
        punctuation_count = len(re.findall(r'[,.;:!?]', text))

        # Calculate complexity factors
        long_word_ratio = long_words / word_count if word_count > 0 else 0
        punctuation_density = punctuation_count / word_count if word_count > 0 else 0

        # Calculate adjustment factor (lower = slower speech needed)
        complexity_factor = 1.0 - (long_word_ratio * 0.3) - (punctuation_density * 0.2)

        # Ensure factor stays within reasonable bounds
        return max(0.7, min(1.0, complexity_factor))

⚙️ Technical Implementation Notes

Key Algorithms & Innovations

Why This Approach Works