Skip to content

Commit 7a13dab

Browse files
Enhance downloader and parser modules with improved error handling and anti-hotlinking protection
- Added common file magic bytes for detecting anti-hotlink responses in downloader.
- Updated content validation to return detailed error reasons for invalid TS content.
- Enhanced decryption logic to log strategies and results for better debugging.
- Improved M3U8 parser with logging of playlist content previews and segment counts.
- Introduced additional headers in download requests to bypass anti-hotlinking measures.
- Implemented checks for anti-hotlinking errors during segment downloads, with appropriate logging and error handling.
1 parent 733316c commit 7a13dab

File tree

3 files changed

+151
-34
lines changed

3 files changed

+151
-34
lines changed

m3u8-downloader/docker/worker/downloader.py

Lines changed: 85 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,11 @@
2222
TS_SYNC_BYTE = b'\x47'
2323
TS_PACKET_SIZE = 188
2424

25+
# Common file magic bytes for detecting anti-hotlink responses
26+
JPEG_MAGIC = b'\xff\xd8\xff'
27+
PNG_MAGIC = b'\x89PNG'
28+
GIF_MAGIC = b'GIF8'
29+
2530

2631
class SegmentDownloader:
2732
"""Download video segments with multi-threading and retry logic"""
@@ -56,22 +61,30 @@ def __init__(
5661
# Create output directory
5762
self.output_dir.mkdir(parents=True, exist_ok=True)
5863

59-
def _is_valid_ts_content(self, data: bytes) -> bool:
64+
def _is_valid_ts_content(self, data: bytes) -> tuple[bool, str]:
6065
"""
6166
Validate if the content is a valid MPEG-TS file.
62-
Returns False if it looks like HTML/text error page.
67+
Returns (is_valid, error_reason) tuple.
6368
"""
6469
if not data or len(data) < TS_PACKET_SIZE:
65-
return False
70+
return False, "Content too small"
71+
72+
# Check for image files (anti-hotlinking protection)
73+
if data[:3] == JPEG_MAGIC:
74+
return False, "Server returned JPEG image (anti-hotlinking protection)"
75+
if data[:4] == PNG_MAGIC:
76+
return False, "Server returned PNG image (anti-hotlinking protection)"
77+
if data[:4] == GIF_MAGIC:
78+
return False, "Server returned GIF image (anti-hotlinking protection)"
6679

6780
# Check if it starts with HTML (error page)
6881
if data[:5].lower() in (b'<!doc', b'<html', b'<?xml'):
69-
return False
82+
return False, "Server returned HTML error page"
7083

7184
# Check for common error text patterns
7285
lower_start = data[:500].lower()
7386
if b'error' in lower_start or b'forbidden' in lower_start or b'denied' in lower_start:
74-
return False
87+
return False, "Server returned error response"
7588

7689
# Check for TS sync byte at expected positions
7790
# TS packets are 188 bytes, sync byte should appear at 0, 188, 376, etc.
@@ -81,32 +94,62 @@ def _is_valid_ts_content(self, data: bytes) -> bool:
8194
sync_count += 1
8295

8396
# If we found sync bytes at expected positions, it's likely valid
84-
return sync_count >= 2
97+
if sync_count >= 2:
98+
return True, ""
99+
100+
return False, "Invalid TS format (no sync bytes found)"
85101

86102
def _decrypt_segment(self, data: bytes, segment_index: int) -> bytes:
87103
"""Decrypt AES-128 encrypted segment"""
88104
if not self.encryption_key:
89105
return data
90106

91-
try:
92-
# Use provided IV or derive from segment index
93-
if self.encryption_iv:
94-
iv = self.encryption_iv
107+
# Log key info on first segment
108+
if segment_index == 0:
109+
logger.info(f"Encryption key (first 4 bytes): {self.encryption_key[:4].hex()}")
110+
if self.encryption_iv is not None:
111+
logger.info(f"Using provided IV: {self.encryption_iv.hex()}")
95112
else:
96-
# Default IV is segment sequence number as 16-byte big-endian
97-
iv = segment_index.to_bytes(16, byteorder='big')
113+
logger.info("No IV provided, will use segment index")
114+
115+
try:
116+
# Try multiple IV strategies
117+
iv_strategies = []
98118

99-
cipher = AES.new(self.encryption_key, AES.MODE_CBC, iv)
100-
decrypted = cipher.decrypt(data)
119+
# Strategy 1: Use provided IV if specified (HLS spec compliant)
120+
if self.encryption_iv is not None:
121+
iv_strategies.append(("provided IV", self.encryption_iv))
101122

102-
# Remove PKCS7 padding
103-
try:
104-
decrypted = unpad(decrypted, AES.block_size)
105-
except ValueError:
106-
# Some streams don't use proper padding
107-
pass
123+
# Strategy 2: Use segment index as IV (common non-compliant streams)
124+
iv_strategies.append(("segment index IV", segment_index.to_bytes(16, byteorder='big')))
125+
126+
# Strategy 3: Use zeros IV if not already tried
127+
if self.encryption_iv is None or self.encryption_iv != bytes(16):
128+
iv_strategies.append(("zeros IV", bytes(16)))
129+
130+
for strategy_name, iv in iv_strategies:
131+
cipher = AES.new(self.encryption_key, AES.MODE_CBC, iv)
132+
decrypted = cipher.decrypt(data)
133+
134+
# Remove PKCS7 padding
135+
try:
136+
decrypted = unpad(decrypted, AES.block_size)
137+
except ValueError:
138+
# Some streams don't use proper padding
139+
pass
140+
141+
# Check if decryption produced valid TS data
142+
if decrypted[:1] == TS_SYNC_BYTE:
143+
if segment_index < 3: # Log first few segments
144+
logger.info(f"Segment {segment_index}: Decryption successful with {strategy_name}")
145+
return decrypted
108146

147+
# None of the strategies worked
148+
logger.warning(f"Segment {segment_index}: All decryption strategies failed (first byte after zeros IV: {hex(decrypted[0]) if decrypted else 'empty'})")
149+
150+
# Return the last decrypted result (with zeros IV) - let ffmpeg try to handle it
109151
return decrypted
152+
110153
except Exception as e:
111154
logger.warning(f"Decryption failed for segment {segment_index}: {e}")
112155
return data # Return original data if decryption fails
@@ -133,6 +176,11 @@ def download_segment(
133176
try:
134177
logger.debug(f"Downloading segment {index}: {url}")
135178

179+
# Log headers for first segment to help debug anti-hotlink issues
180+
if index == 0 and retry_count == 0:
181+
logger.info(f"Segment download headers: {self.headers}")
182+
logger.info(f"First segment URL: {url}")
183+
136184
response = self.session.get(
137185
url,
138186
headers=self.headers,
@@ -161,12 +209,23 @@ def download_segment(
161209
content = self._decrypt_segment(content, index)
162210

163211
# Validate content is actually a TS file (not an error page)
164-
if not self._is_valid_ts_content(content):
165-
# Log first 200 bytes for debugging
166-
preview = content[:200]
167-
logger.error(f"Segment {index} content is not valid TS data")
168-
logger.error(f"Content preview (first 200 bytes): {preview}")
169-
raise ValueError(f"Invalid TS content - possibly HTML error page or encrypted data")
212+
is_valid, error_reason = self._is_valid_ts_content(content)
213+
if not is_valid:
214+
# Check if this looks like encrypted data that we couldn't decrypt
215+
# In that case, still save it and let ffmpeg try to handle it
216+
skip_validation = os.environ.get('SKIP_TS_VALIDATION', 'false').lower() == 'true'
217+
218+
# Always skip validation for encrypted streams where decryption produced non-image data
219+
if self.encryption_key and not content[:3] in (JPEG_MAGIC, PNG_MAGIC[:3], GIF_MAGIC[:3]):
220+
logger.warning(f"Segment {index}: {error_reason} - saving anyway for ffmpeg to process")
221+
elif skip_validation:
222+
logger.warning(f"Segment {index}: {error_reason} - validation skipped")
223+
else:
224+
# Log first 200 bytes for debugging
225+
preview = content[:200]
226+
logger.error(f"Segment {index}: {error_reason}")
227+
logger.error(f"Content preview (first 200 bytes): {preview}")
228+
raise ValueError(error_reason)
170229

171230
# Write validated content to file
172231
with open(output_path, 'wb') as f:

m3u8-downloader/docker/worker/m3u8_parser.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,28 @@ def parse(self) -> Dict:
6464
# Fetch playlist content
6565
content = self.fetch_playlist()
6666

67+
# Log first 500 chars of content to diagnose parsing issues
68+
content_preview = content[:500] if len(content) > 500 else content
69+
logger.info(f"Playlist content preview ({len(content)} bytes):\n{content_preview}")
70+
6771
# Parse with m3u8 library
6872
playlist = m3u8.loads(content, uri=self.url)
6973

7074
# Check if this is a master playlist (with variants)
7175
if playlist.is_variant:
7276
logger.info("Master playlist detected, selecting best quality")
73-
return self._parse_master_playlist(playlist)
77+
return self._parse_master_playlist(playlist, content)
7478
else:
7579
logger.info("Media playlist detected")
76-
return self._parse_media_playlist(playlist)
80+
# Debug: log segment count before parsing
81+
logger.debug(f"Raw playlist has {len(playlist.segments)} segments, {len(playlist.playlists)} playlists")
82+
return self._parse_media_playlist(playlist, content)
7783

7884
except Exception as e:
7985
logger.error(f"Failed to parse m3u8: {e}")
8086
raise
8187

82-
def _parse_master_playlist(self, playlist: m3u8.M3U8) -> Dict:
88+
def _parse_master_playlist(self, playlist: m3u8.M3U8, content: str = None) -> Dict:
8389
"""Parse master playlist and select best quality variant"""
8490
if not playlist.playlists:
8591
raise ValueError("No variants found in master playlist")
@@ -108,13 +114,13 @@ def _parse_master_playlist(self, playlist: m3u8.M3U8) -> Dict:
108114
variant_content = variant_parser.fetch_playlist()
109115
variant_playlist = m3u8.loads(variant_content, uri=variant_url)
110116

111-
result = self._parse_media_playlist(variant_playlist)
117+
result = self._parse_media_playlist(variant_playlist, variant_content)
112118
result['resolution'] = resolution
113119
result['selected_variant_url'] = variant_url
114120

115121
return result
116122

117-
def _parse_media_playlist(self, playlist: m3u8.M3U8) -> Dict:
123+
def _parse_media_playlist(self, playlist: m3u8.M3U8, content: str = None) -> Dict:
118124
"""Parse media playlist and extract segment URLs"""
119125
segments = []
120126
total_duration = 0.0
@@ -132,6 +138,10 @@ def _parse_media_playlist(self, playlist: m3u8.M3U8) -> Dict:
132138
total_duration += segment.duration
133139

134140
if not segments:
141+
# Log the actual content for debugging
142+
if content:
143+
content_preview = content[:1000] if len(content) > 1000 else content
144+
logger.error(f"Playlist content (no segments found):\n{content_preview}")
135145
raise ValueError("No segments found in playlist")
136146

137147
logger.info(f"Found {len(segments)} segments, total duration: {total_duration:.1f}s")
@@ -165,15 +175,27 @@ def _get_encryption_info(self, playlist: m3u8.M3U8) -> Optional[Dict]:
165175
response.raise_for_status()
166176
key = response.content
167177

178+
# Validate key length (AES-128 requires 16 bytes)
179+
logger.info(f"Encryption key length: {len(key)} bytes")
180+
if len(key) != 16:
181+
logger.warning(f"Unexpected key length: {len(key)} bytes (expected 16)")
182+
# Some servers return key with extra whitespace or headers
183+
if len(key) > 16:
184+
logger.info(f"Key preview (first 32 bytes): {key[:32]}")
185+
168186
# Get IV from key info or use default
169187
iv = None
170188
if segment.key.iv:
171189
# IV is usually specified as hex string like 0x...
172190
iv_str = segment.key.iv
191+
logger.info(f"IV from m3u8: {iv_str}")
173192
if iv_str.startswith('0x') or iv_str.startswith('0X'):
174193
iv = bytes.fromhex(iv_str[2:])
175194
else:
176195
iv = bytes.fromhex(iv_str)
196+
logger.info(f"Parsed IV length: {len(iv)} bytes, value: {iv.hex()}")
197+
else:
198+
logger.info("No IV specified in m3u8, will use segment sequence number")
177199

178200
return {
179201
'method': 'AES-128',

m3u8-downloader/docker/worker/worker.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,20 @@ def _process_m3u8_download(self, job_id: str, job: dict):
318318
if 'User-Agent' not in headers:
319319
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36'
320320

321+
# Add additional browser-like headers to bypass anti-hotlinking
322+
if 'Accept' not in headers:
323+
headers['Accept'] = '*/*'
324+
if 'Accept-Language' not in headers:
325+
headers['Accept-Language'] = 'en-US,en;q=0.9'
326+
if 'Accept-Encoding' not in headers:
327+
headers['Accept-Encoding'] = 'gzip, deflate, br'
328+
if 'Sec-Fetch-Dest' not in headers:
329+
headers['Sec-Fetch-Dest'] = 'empty'
330+
if 'Sec-Fetch-Mode' not in headers:
331+
headers['Sec-Fetch-Mode'] = 'cors'
332+
if 'Sec-Fetch-Site' not in headers:
333+
headers['Sec-Fetch-Site'] = 'cross-site'
334+
321335
# Debug: Log headers to verify
322336
logger.info(f"Request headers: {headers}")
323337
if 'Cookie' in headers:
@@ -351,10 +365,22 @@ def _process_m3u8_download(self, job_id: str, job: dict):
351365
logger.info("Step 2: Downloading segments")
352366
temp_dir = tempfile.mkdtemp(prefix=f"m3u8_{job_id}_")
353367

368+
# Create segment-specific headers - some CDNs expect Referer to be the m3u8 URL
369+
segment_headers = headers.copy()
370+
m3u8_base_url = playlist_info.get('base_url', job['url'])
371+
parsed_m3u8 = urlparse(m3u8_base_url)
372+
m3u8_origin = f"{parsed_m3u8.scheme}://{parsed_m3u8.netloc}"
373+
374+
# Use m3u8 URL as Referer for segments (common anti-hotlink bypass)
375+
segment_headers['Referer'] = m3u8_base_url
376+
segment_headers['Origin'] = m3u8_origin
377+
logger.info(f"Segment Referer set to: {m3u8_base_url}")
378+
logger.info(f"Segment Origin set to: {m3u8_origin}")
379+
354380
downloader = SegmentDownloader(
355381
segments=playlist_info['segments'],
356382
output_dir=temp_dir,
357-
headers=headers,
383+
headers=segment_headers,
358384
max_workers=int(os.getenv('MAX_DOWNLOAD_WORKERS', 2)),
359385
encryption_key=playlist_info.get('encryption_key'),
360386
encryption_iv=playlist_info.get('encryption_iv')
@@ -370,9 +396,19 @@ def progress_callback(completed, total):
370396
download_progress = int(5 + (completed / total) * 80)
371397
self.update_job_status(job_id, "downloading", progress=download_progress)
372398

373-
# Check if too many segments failed with 403/474 errors during download
399+
# Check if too many segments failed during download
374400
failed_count = len(downloader.failed_segments)
375-
if failed_count > 20:
401+
if failed_count > 5:
402+
# Count anti-hotlink protection errors
403+
hotlink_count = sum(
404+
1 for item in downloader.failed_segments
405+
if 'anti-hotlinking' in item['error'].lower() or 'JPEG' in item['error'] or 'PNG' in item['error']
406+
)
407+
408+
if hotlink_count >= 5:
409+
logger.error(f"Anti-hotlinking protection detected: {hotlink_count} segments blocked")
410+
raise Exception(f"Download aborted: Server blocked segment downloads (anti-hotlinking protection). Try refreshing the source page and retrying.")
411+
376412
# Count HTTP 403/474 errors
377413
http_error_count = sum(
378414
1 for item in downloader.failed_segments

0 commit comments

Comments
 (0)