diff --git a/.auxiliary/notes/confidence.md b/.auxiliary/notes/confidence.md new file mode 100644 index 0000000..9cde9a2 --- /dev/null +++ b/.auxiliary/notes/confidence.md @@ -0,0 +1,307 @@ +# Confidence Scoring Strategy + +## Overview + +This document describes the confidence scoring strategy for detection results in detextive. The core principle is that confidence should reflect **both detection quality AND sample size adequacy**. + +## Design Philosophy + +### Why Scale All Confidence by Content Size? + +1. **Small samples are inherently less reliable**: A charset detection on 10 bytes is fundamentally less trustworthy than the same detection on 1000 bytes, regardless of what the detector reports. + +2. **Empirical justification**: `chardet` is known to be overconfident on small samples, sometimes reporting high confidence on minimal data that could be interpreted multiple ways. + +3. **Cost-benefit alignment**: Trial decoding and validation are **cheaper** for small content. Being more conservative (lower confidence → more validation) when it matters least (small files) is a win-win. + +4. **Smooth, predictable behavior**: Linear scaling avoids arbitrary threshold discontinuities. A step function would create sudden behavior changes at threshold boundaries, while linear scaling provides gradual, intuitive confidence progression. + +5. **Philosophical consistency**: "Honest about limitations" means acknowledging that charset/MIME detection is fundamentally harder with less data. Our confidence scores should reflect this reality. + +## Size Scaling Formula + +```python +def confidence_from_bytes_quantity( + content: Content, behaviors: Behaviors = BEHAVIORS_DEFAULT +) -> float: + return min(1.0, len(content) / behaviors.bytes_quantity_confidence_divisor) +``` + +**Default divisor**: 1024 bytes + +This means: +- 512 bytes → 0.5 scaling factor +- 1024 bytes → 1.0 scaling factor (full confidence) +- 2048 bytes → 1.0 (capped at maximum) + +## Detector-Specific Strategies + +### Detectors With Intrinsic Confidence + +These detectors provide their own confidence scores based on detection quality. We multiply by the size scaling factor. + +#### chardet (Charset Detection) + +```python +def _detect_via_chardet( + content: Content, behaviors: Behaviors +) -> CharsetResult | types.NotImplementedType: + try: import chardet + except ImportError: return NotImplemented + result_ = chardet.detect(content) + charset, confidence = result_['encoding'], result_['confidence'] + + # Scale confidence by content size + size_factor = confidence_from_bytes_quantity(content, behaviors=behaviors) + confidence = confidence * size_factor + + return CharsetResult(charset=charset, confidence=confidence) +``` + +**Rationale**: `chardet` reports confidence based on statistical analysis, but doesn't account for sample size adequacy. A 95% confidence on 10 bytes should be treated much more skeptically than 95% on 1000 bytes. 
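+
+For a concrete sense of the effect, the sketch below applies the scaling formula above to a hypothetical 0.95 raw `chardet` report at two sample sizes (illustrative arithmetic only, using the default 1024-byte divisor):
+
+```python
+# Illustrative arithmetic only; 0.95 is a hypothetical raw chardet
+# confidence, and 1024 is the default divisor defined above.
+raw_confidence = 0.95
+for size in (10, 1000):
+    scaled = raw_confidence * min(1.0, size / 1024)
+    print(f"{size} bytes: {scaled:.3f}")
+# 10 bytes: 0.009
+# 1000 bytes: 0.928
+```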
+ +#### puremagic (MIME Type Detection) + +```python +def _detect_via_puremagic( + content: Content, behaviors: Behaviors +) -> MimetypeResult | types.NotImplementedType: + try: import puremagic + except ImportError: return NotImplemented + try: + matches = puremagic.magic_string(content) + if not matches: return NotImplemented + match = matches[0] # Best match + + # Use puremagic's intrinsic confidence, scaled by size + size_factor = confidence_from_bytes_quantity(content, behaviors=behaviors) + confidence = match.confidence * size_factor + + return MimetypeResult(mimetype=match.mime_type, confidence=confidence) + except (puremagic.PureError, ValueError): + return NotImplemented +``` + +**Rationale**: `puremagic` provides confidence scores (typically 0.4-0.8) based on signature match quality. Longer, more specific signatures get higher confidence. Similar to `chardet`, these scores benefit from size scaling. + +**Note**: The current implementation uses `puremagic.from_string(content, mime=True)` which returns a simple string. To access confidence, we need to use `puremagic.magic_string(content)` instead, which returns `PureMagicWithConfidence` objects. + +### Detectors Without Intrinsic Confidence + +These detectors only return a detection result without quality assessment. We assign a base confidence constant, then scale by size. + +#### magic/libmagic (MIME Type Detection) + +```python +def _detect_via_magic( + content: Content, behaviors: Behaviors +) -> MimetypeResult | types.NotImplementedType: + try: import magic + except ImportError: return NotImplemented + try: mimetype = magic.from_buffer(content, mime=True) + except Exception: return NotImplemented + + # Use different base confidence for textual vs binary formats + if is_textual_mimetype(mimetype): + BASE_CONFIDENCE = 0.75 # Lower for text (heuristic-based) + else: + BASE_CONFIDENCE = 0.95 # Higher for binary (magic bytes) + + confidence = BASE_CONFIDENCE * confidence_from_bytes_quantity( + content, behaviors=behaviors) + return MimetypeResult(mimetype=mimetype, confidence=confidence) +``` + +**Rationale**: +- **Binary formats (0.95)**: libmagic excels at detecting structured binary formats with magic bytes (PNG: `\x89PNG`, PDF: `%PDF`, etc.). These are unambiguous byte patterns with decades of curated signatures. +- **Textual formats (0.75)**: Text detection is often heuristic-based. `text/plain` is frequently a fallback/guess. `text/html`, `text/xml`, and even `application/json` (which may be detected as `text/plain` on some platforms) are more ambiguous and context-dependent. + +**Platform note**: `magic` behavior varies across platforms and versions. On Windows, JSON content may return `text/plain` instead of `application/json`. The textual/non-textual distinction handles this gracefully. 
+ +#### charset-normalizer (Charset Detection) + +```python +def _detect_via_charset_normalizer( + content: Content, behaviors: Behaviors +) -> CharsetResult | types.NotImplementedType: + try: import charset_normalizer + except ImportError: return NotImplemented + result_ = charset_normalizer.from_bytes(content).best() + charset = None if result_ is None else result_.encoding + + # charset-normalizer doesn't provide usable confidence + # Use base constant scaled by size + BASE_CONFIDENCE = 0.85 + confidence = BASE_CONFIDENCE * confidence_from_bytes_quantity( + content, behaviors=behaviors) + + return CharsetResult(charset=charset, confidence=confidence) +``` + +**Rationale**: +- charset-normalizer has `coherence` and related attributes, but these are not reliable confidence metrics (often 0.0) +- Evaluation results showed: 92% accurate on UTF-8, but only 17% on Latin-1/CP1252 +- Base confidence of 0.85 reflects that it's good but not as reliable as `chardet` (which provides its own confidence) +- Still higher than textual MIME detection (0.75) since charset detection is more targeted + +## Confidence Constants Summary + +| Detector | Type | Strategy | Base Confidence | Notes | +|----------|------|----------|-----------------|-------| +| `chardet` | Charset | Intrinsic × size | N/A (uses reported) | Statistical analysis | +| `puremagic` | MIME | Intrinsic × size | N/A (uses reported) | Signature match quality (0.4-0.8) | +| `magic` (binary) | MIME | Constant × size | 0.95 | Magic bytes, very reliable | +| `magic` (textual) | MIME | Constant × size | 0.75 | Heuristic-based, less reliable | +| `charset-normalizer` | Charset | Constant × size | 0.85 | Good for UTF-8, weaker for legacy | + +## Example Confidence Calculations + +### Small File (100 bytes) +Size factor: `100 / 1024 = 0.0977` (~0.1) + +- **chardet** (0.95 raw): `0.95 × 0.1 = 0.095` +- **magic** binary (0.95 base): `0.95 × 0.1 = 0.095` +- **magic** textual (0.75 base): `0.75 × 0.1 = 0.075` +- **charset-normalizer** (0.85 base): `0.85 × 0.1 = 0.085` +- **puremagic** (0.8 raw): `0.8 × 0.1 = 0.08` + +All appropriately conservative. With `trial_decode_confidence = 0.80`, all trigger validation. + +### Medium File (512 bytes) +Size factor: `512 / 1024 = 0.5` + +- **chardet** (0.95 raw): `0.95 × 0.5 = 0.475` +- **magic** binary (0.95 base): `0.95 × 0.5 = 0.475` +- **magic** textual (0.75 base): `0.75 × 0.5 = 0.375` +- **charset-normalizer** (0.85 base): `0.85 × 0.5 = 0.425` +- **puremagic** (0.8 raw): `0.8 × 0.5 = 0.4` + +Still below 0.80 threshold, but closer. More validation occurs. + +### Full Confidence (1024+ bytes) +Size factor: `1024 / 1024 = 1.0` (or higher, capped at 1.0) + +- **chardet** (0.95 raw): `0.95 × 1.0 = 0.95` +- **magic** binary (0.95 base): `0.95 × 1.0 = 0.95` +- **magic** textual (0.75 base): `0.75 × 1.0 = 0.75` +- **charset-normalizer** (0.85 base): `0.85 × 1.0 = 0.85` +- **puremagic** (0.8 raw): `0.8 × 1.0 = 0.8` + +Nice spread. Binary detections and high-confidence chardet skip validation. Textual MIME and charset-normalizer still trigger validation unless detection is very confident or sample is larger. + +## Interaction with Behavior Thresholds + +### `trial_decode_confidence` (default: 0.80) + +Minimum confidence to skip trial decoding during charset detection. 
+ +With size scaling: +- Small files almost always trigger trial decode (good: cheap to validate) +- Medium files trigger if detector isn't confident +- Large files only skip if detector is confident + +### `text_validate_confidence` (default: 0.80) + +Minimum confidence to skip text validation. + +Similar behavior: more validation on small samples, less on large confident detections. + +## Special Cases + +### Empty Content + +Empty content (`b''`) always returns: +- Charset: default charset with 1.0 confidence +- MIME: `text/plain` with 1.0 confidence + +No detection is needed, so confidence is absolute. + +### Content with BOM + +BOMs (Byte Order Marks) provide near-certainty for UTF-8/UTF-16 detection regardless of size. However: +- This is already handled in `_normalize_charset_detection()` which checks for BOM and adjusts charset accordingly +- No special confidence handling needed; chardet will report high confidence, which is appropriate + +### Pure ASCII + +Small pure ASCII samples (like `b"Hello"`) get scaled down confidence, but: +- ASCII is promoted to UTF-8 via `charset_promotions` +- Small ASCII content is cheap to validate +- Erring on the side of validation is fine + +## Implementation Notes + +### Current State (Before Changes) + +- ✅ `chardet`: Returns raw confidence (no scaling) +- ✅ `charset-normalizer`: Uses `confidence_from_bytes_quantity()` +- ✅ `magic`: Uses `confidence_from_bytes_quantity()` +- ✅ `puremagic`: Uses `confidence_from_bytes_quantity()` + +### Needed Changes + +1. **Scale chardet confidence**: Multiply by size factor +2. **Add base constants**: Define base confidence for `magic` and `charset-normalizer` +3. **Textual/binary distinction for magic**: Use `is_textual_mimetype()` to select base confidence +4. **Extract puremagic confidence**: Switch from `from_string()` to `magic_string()` to access confidence scores + +## Documentation for Users + +Users should understand that confidence scores in detextive are composite: + +> **Confidence scores reflect both detection quality and sample adequacy.** +> +> A confidence of 0.95 from detextive means both: +> - The detector is highly confident in its result +> - There is sufficient content for reliable detection +> +> For small samples (< 1024 bytes), confidence is proportionally reduced to encourage validation. This is intentional: charset and MIME type detection are fundamentally less reliable with less data. + +## Rationale: Why Not Step Functions? + +An alternative approach would be minimum size thresholds: + +```python +if len(content) < 1024: + confidence = min(confidence, 0.79) # Force below threshold +``` + +**Problems with this approach**: +1. **Discontinuous behavior**: 1023 bytes → untrusted, 1024 bytes → fully trusted +2. **Arbitrary boundary**: Why 1024? Why not 512 or 2048? +3. **Loss of information**: A 1000-byte detection is more reliable than a 100-byte detection, but both get capped + +Linear scaling is more principled, predictable, and preserves relative quality differences across sample sizes. 
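+
+To make the contrast concrete, here is a small sketch comparing linear scaling with a hypothetical step-function cap around the 1024-byte boundary (assumes the default 1024-byte divisor and a 0.95 raw detector confidence; not part of the implementation):
+
+```python
+# Sketch only: contrasts linear scaling with a hypothetical
+# cap-below-threshold step function.
+RAW, DIVISOR, CAP = 0.95, 1024, 0.79
+
+def linear(size: int) -> float:
+    return RAW * min(1.0, size / DIVISOR)
+
+def step(size: int) -> float:
+    return min(RAW, CAP) if size < DIVISOR else RAW
+
+for size in (100, 1000, 1023, 1024):
+    print(size, round(linear(size), 3), round(step(size), 3))
+# 100   0.093  0.79
+# 1000  0.928  0.79
+# 1023  0.949  0.79
+# 1024  0.95   0.95
+```
+
+The step function treats 100 bytes and 1000 bytes identically and jumps abruptly at the boundary; linear scaling preserves the difference and crosses the boundary smoothly.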
+ +## Future Considerations + +### Tunable Parameters + +If users want different size/confidence tradeoffs, they can adjust: + +```python +behaviors = Behaviors( + bytes_quantity_confidence_divisor=512, # Smaller threshold + trial_decode_confidence=0.70, # Lower bar for skipping validation +) +``` + +### Alternative Scaling Functions + +Linear scaling is simple and effective, but alternatives could be considered: + +```python +# Logarithmic (slower growth, more conservative) +confidence = math.log(len(content) + 1) / math.log(1025) + +# Sigmoid (smooth S-curve with inflection point) +confidence = 1 / (1 + math.exp(-k * (len(content) - midpoint))) +``` + +For now, linear scaling aligns with the design philosophy: simple, honest, and predictable. + +## Related Documents + +- `.auxiliary/notes/decode-refactor.md` - Context-based trial order and design philosophy +- `.auxiliary/notes/charset-detector-evaluation-results.md` - Empirical detector performance data +- `documentation/architecture/designs/001-python-api.rst` - API design including confidence scoring diff --git a/.auxiliary/notes/decode-refactor.md b/.auxiliary/notes/decode-refactor.md index 4cbb8e8..e06ab8b 100644 --- a/.auxiliary/notes/decode-refactor.md +++ b/.auxiliary/notes/decode-refactor.md @@ -4,359 +4,185 @@ The current `decode()` implementation has become overly complex with multiple special cases, three different `trial_codecs` usage patterns, and platform-specific encoding issues. The Windows Python 3.11+ doctest failures revealed fundamental issues with how we handle charset detection and validation. -## Core Insight: 8-bit Charsets Are Uninformative +## Core Insight: Charset Detection is Fundamentally Hard -**Key realization**: 8-bit character sets (cp1252, iso-8859-*, etc.) accept any byte sequence because they have one-to-one correspondence between byte values and code points. Trial decodes with these charsets tell us nothing about correctness. +**Key realization**: Without context, charset detection is heuristics all the way down. No amount of algorithmic complexity can solve the fundamental ambiguity problem. -Only **7-bit** (ASCII) and **multi-byte** (UTF-8, Shift-JIS, etc.) charsets provide informative feedback through decode success/failure. +**Examples of inherent ambiguity:** +- UTF-8 Turkish text decoded as ISO-8859-9 produces valid-looking mojibake +- ISO-8859-9 Turkish text decoded as UTF-8 also produces mojibake +- Both are "valid" decodings with different results +- Without external context (user knowledge, file source, HTTP headers), detection is guessing -## Design Principles +## Design Philosophy: Simplicity + User Control -1. **Ignore MIME type in `decode()`** - Focus solely on getting correct text -2. **Consider confidence for non-8-bit detections** - Even multi-byte charsets can be misdetected; 7-bit (ASCII) especially unreliable -3. **Distrust 8-bit detections** - They always succeed but may produce mojibake -4. **Respect configurable validation behavior** - Honor existing `text_validate` settings -5. **Shortest string wins for multi-byte** - Mojibake produces longer strings -6. 
**User supplement gets priority among 8-bit** - Respect user knowledge +After extensive analysis of multi-tier categorization schemes (permissive vs restrictive, multi-byte vs single-byte, etc.), we conclude: -## New Architecture +**Better to be simple and honest about limitations than complex and pretending to solve the unsolvable.** -### Helper Function: `is_permissive_charset()` +### What We Learned -```python -# Module-level cache (always on) -_PERMISSIVE_CHARSET_CACHE: dict[str, bool] = {} +1. **CP1252 is not fully permissive**: Has 5 undefined bytes (0x81, 0x8d, 0x8f, 0x90, 0x9d) +2. **ISO-8859-* variants are fully permissive**: All 256 bytes decode (many variants exist for different languages) +3. **ASCII compatibility is universal**: All major 8-bit encodings preserve ASCII in bytes 0x00-0x7F +4. **UTF-8 vs CP1252 length heuristic works**: UTF-8 multi-byte always produces shorter strings than 8-bit misinterpretation +5. **But length heuristic fails for other encodings**: Turkish ISO-8859-9 vs UTF-8 can produce same-length mojibake + +### Implementation Findings +`is_permissive_charset()` successfully implemented: +```python def is_permissive_charset(charset: str) -> bool: - """Check if charset accepts all byte sequences (8-bit encoding). - - Returns True for: cp1252, iso-8859-*, koi8-r, etc. - Returns False for: utf-8, ascii, shift-jis, etc. - - Tests both ascending and descending byte sequences to detect - multi-byte sequence introducers, and checks decoded length - to ensure 1:1 byte-to-character mapping. - """ - # Normalize and check cache - charset_normalized = normalize_charset(charset) - if charset_normalized in _PERMISSIVE_CHARSET_CACHE: - return _PERMISSIVE_CHARSET_CACHE[charset_normalized] - - try: - # Test ascending sequence - ascending = bytes(range(256)) - text_asc = ascending.decode(charset, errors='strict') - - # Test descending sequence (catches multi-byte introducers) - descending = bytes(range(255, -1, -1)) - text_desc = descending.decode(charset, errors='strict') - - # Check lengths: must be exactly 256 chars (1:1 mapping) - is_permissive = (len(text_asc) == 256 and len(text_desc) == 256) - - _PERMISSIVE_CHARSET_CACHE[charset_normalized] = is_permissive - return is_permissive - - except (UnicodeDecodeError, LookupError): - # Some bytes failed → informative charset - _PERMISSIVE_CHARSET_CACHE[charset_normalized] = False - return False + """Check if charset accepts all 256 byte values.""" + # Test ascending and descending sequences + # Check length == 256 (1:1 byte-to-char mapping) + # Cache results ``` -**Implementation notes:** -- Cache always enabled (minimal memory footprint) -- Tests both ascending and descending byte sequences -- Checks decoded length to detect multi-byte encodings -- Handles unknown/future charsets automatically +Results: +- ✅ ISO-8859-1: `True` (fully permissive) +- ✅ CP1252: `False` (5 undefined bytes) +- ✅ ASCII: `False` (only 128 values) +- ✅ UTF-8: `False` (multi-byte sequences) -### New Function: `detect_charset_reliable()` +But this revealed new complexity: need to subcategorize "restrictive" into multi-byte vs single-byte to avoid CP1252 mojibake before UTF-8 attempts. -Wrapper around `detect_charset_confidence()` that validates suspicious detections via trial decode: +**This led to a design rabbit hole that misses the forest for the trees.** -```python -def detect_charset_reliable(content, ...): - """Detect charset with validation of suspicious results. - - Part of public API. 
Applications can use this for more reliable - detection than raw detect_charset(). - """ - result = detect_charset_confidence(content, ...) - detected, confidence = result.charset, result.confidence - - # Consider confidence, especially for 7-bit and multi-byte - # Even non-8-bit charsets can be misdetected - if not is_permissive_charset(detected): - # If confidence is high enough, trust it - # Reuse existing threshold from behaviors DTO - if confidence >= behaviors.charset_confidence_threshold: - return result - # Otherwise, try defaults as well - - # Detected is 8-bit or low-confidence, try defaults - python_default = sys.getdefaultencoding() # utf-8 - os_default = discover_os_charset_default() # varies - - for default in [python_default, os_default]: - if not is_permissive_charset(default): - try: - content.decode(default) - # Return with appropriate confidence - return CharsetResult(charset=default, confidence=...) - except UnicodeDecodeError: - continue - - # All informative charsets failed, return original detection - return result -``` +## Simplified Design (Current Direction) -**Note**: Also add `detect_charset_confidence_reliable()` variant that returns full result object. +### Principles -### Helper Function: `_decode_with_http_content_type()` +1. **Put user in control**: Provide supplement as `str` or codec specifier +2. **Use sensible defaults**: OS charset for local files, Python charset (usually UTF-8) for general use +3. **Trust high-confidence detection**: But allow it to be overridden by user/context +4. **Keep it simple**: Fewer tiers, clearer behavior, easier to reason about -Extract HTTP Content-Type handling into helper: +### Trial Order Strategy ```python -def _decode_with_http_content_type( - content, http_content_type, behaviors, profile, location -): - """Attempt decode with charset from HTTP Content-Type header. - - Returns decoded text if successful, None if should fall back to detection. - Always falls back (never raises) on failure. - """ - charset = parse_charset_from_content_type(http_content_type) - if not charset or is_absent(charset): - return None - - # Use existing trial decode helpers - try: - text, result = attempt_decodes( - content, - behaviors=behaviors, - inference=charset, - location=location - ) - # Validate if configured - if should_validate_text(behaviors, result.confidence): - if not profile(text): - return None # Fall back - return text - except ContentDecodeFailure: - return None # Fall back +trial_order = [ + UserSupplement, # User knows their data (highest priority) + OsDefault, # Sensible for local filesystem content + PythonDefault, # Usually UTF-8, can be set via PYTHONIOENCODING +] + +# Insert detected charset based on confidence: +if detection.confidence >= behaviors.trial_decode_confidence: + trial_order.insert(1, FromInference) # After user, before OS +else: + trial_order.append(FromInference) # At end (suspicious) ``` -### Refactored `decode()` Flow +### User Supplement Enhancement +Allow `charset_supplement` to be either: +- **`str`**: Specific charset name (e.g., `'utf-8'`, `'iso-8859-9'`) +- **Codec specifier**: `OsDefault`, `PythonDefault`, etc. + +**Use cases:** ```python -def decode(content, http_content_type=None, charset_supplement=None, - behaviors=..., profile=..., location=...): - """Decode bytes to text with intelligent charset selection.""" - - if content == b'': - return '' - - # 1. 
Try authoritative charset from HTTP Content-Type - if http_content_type: - text = _decode_with_http_content_type( - content, http_content_type, behaviors, profile, location) - if text is not None: - return text - # Fall back to detection - - # 2. Detect charset with validation - result = detect_charset_confidence_reliable( - content, behaviors=behaviors, supplement=charset_supplement) - detected = result.charset - - # 3. Build candidate lists - reuse existing trial decode helpers - # Use attempt_decodes() and related functions rather than - # reinventing the wheel - - trial_candidates = [] # Non-8-bit charsets - actual_candidates = [] # 8-bit charsets - - # Add detected - if not is_permissive_charset(detected): - trial_candidates.append(detected) - else: - actual_candidates.append(detected) - - # Add defaults if different from detected and non-8-bit - python_default = sys.getdefaultencoding() # utf-8 - os_default = discover_os_charset_default() # varies - - for default in [python_default, os_default]: - if (default not in trial_candidates - and default not in actual_candidates - and not is_permissive_charset(default)): - trial_candidates.append(default) - - # Add supplement - if not is_absent(charset_supplement): - if is_permissive_charset(charset_supplement): - actual_candidates.insert(0, charset_supplement) - else: - trial_candidates.append(charset_supplement) - - # 4. Try candidates using existing helpers - # Validation timing respects behaviors.text_validate configuration - text = _try_decode_candidates( - content, trial_candidates, actual_candidates, - behaviors, profile, location) - - if text is not None: - return text - - # 5. No valid decode found - raise ContentDecodeFailure(location=location) -``` +# Internet/web content - prefer UTF-8 +decode(content, charset_supplement='utf-8') -**Implementation notes:** -- Reuse existing `attempt_decodes()` and codec trial functions -- Respect `behaviors.text_validate` configuration (Never/AsNeeded/Always) -- Extract helpers to avoid monolithic decode function +# Local filesystem - use OS charset +decode(content, charset_supplement=OsDefault) -### Decision Priority +# Known legacy encoding +decode(content, charset_supplement='iso-8859-9') +``` -When multiple decodes succeed: +### Optional: Use `is_permissive_charset()` for Filtering -1. **Shortest string always wins** (less mojibake) -2. **Tie-breaker**: User supplement over other charsets (user knowledge) -3. 
**Secondary tie-breaker**: Non-8-bit over 8-bit (more informative) +One lightweight use of the permissive check: -**Implementation**: ```python -def _try_decode_candidates(...): - results = [] - - # Try all candidates and collect successful decodes - for charset in all_candidates: - try: - text = content.decode(charset) - if should_validate and not profile(text): - continue - results.append(( - len(text), # Primary: shortest - charset != charset_supplement, # Tie-break: supplement wins - is_permissive_charset(charset), # Secondary: non-8-bit wins - charset, - text - )) - except UnicodeDecodeError: - continue - - if results: - # Sort by tuple: shortest, then supplement, then non-8-bit - results.sort() - return results[0][4] # Return text - - return None +# Skip truly permissive charsets if non-permissive options exist +candidates = build_candidate_list() +non_permissive = [c for c in candidates if not is_permissive_charset(c)] +if non_permissive: + candidates = non_permissive # Prefer informative attempts ``` -### Validation Timing +This prevents trying ISO-8859-1 when UTF-8 is available, without complex multi-tier logic. -Text validation timing is **configurable** via `behaviors.text_validate`: -- **Never**: Skip validation entirely -- **AsNeeded**: Validate based on confidence threshold -- **Always**: Always validate +## Current Implementation Status -The existing behavior configuration is preserved. Validation can happen during candidate selection or after - the difference is minimal in practice since validation is already configurable. +### Implemented ✅ -## OS Default vs Python Default +1. **`is_permissive_charset()`** - Working perfectly with caching +2. **HTTP Content-Type handling** - Extracts and validates charset, falls back gracefully +3. **Separate permissive/restrictive lists** - In `_attempt_decodes()` +4. **BOM handling** - `remove_bom` behavior parameter +5. **Charset deduplication** - Normalized before adding to trial list +6. **Empty content uses default** - Not hardcoded to UTF-8 -- **Python default**: `sys.getdefaultencoding()` → always UTF-8 in Python 3 - - Can be overridden via `PYTHONIOENCODING` or CLI flag -- **OS default**: `locale.getencoding()` (3.11+) or `sys.getfilesystemencoding()` - - cp1252 on Windows, UTF-8 on modern Linux/Mac +### Issues Discovered 🔍 -**Strategy**: Try both when they differ, preferring Python default first. +1. **Complexity creep**: Permissive vs restrictive revealed need for multi-byte vs single-byte subcategorization +2. **CP1252 vs UTF-8 ordering**: CP1252 is "restrictive" but still produces mojibake before UTF-8 +3. **Turkish/Finnish ambiguity**: Historical encodings have legitimate sequences that look like UTF-8 mojibake +4. **No magic bullet**: Algorithmic complexity doesn't solve fundamental ambiguity -**Special case**: Don't trial decode with cp1252 even if it's OS default (8-bit uninformative). +### Next Steps 🎯 -## Impact on Existing APIs +**Decision point**: Continue with complex categorization OR simplify to user-centric approach? 
-### `detect_charset()` -- **No change** - Returns raw detector output -- Used when applications just want to know what chardet/charset-normalizer says +**Recommendation**: Simplify +- Remove complex permissive/restrictive/multi-byte categorization +- Use simple context-based trial order (User → OS → Python → Detection) +- Keep `is_permissive_charset()` only for optional filtering +- Document limitations honestly +- Empower users with supplement options -### `detect_charset_reliable()` (new) -- Validates suspicious (8-bit) or low-confidence detections -- **Part of public API** along with `detect_charset_confidence_reliable()` -- Used internally by `decode()` +## Charset Evaluation Results -### `decode()` -- **Major refactor** - New candidate selection logic -- Ignores MIME type entirely -- Uses helper functions to avoid monolithic implementation -- Reuses existing trial decode functions -- HTTP Content-Type: always falls back to detection on failure (not configurable) +Comprehensive testing of `chardet` vs `charset-normalizer`: -### `infer_*()` functions -- Minor updates may be needed later (defer for now) -- HTTP Content-Type with charset: trial decode only with specified charset +**Key findings:** +- charset-normalizer: 92% accurate on UTF-8, 17% on Latin-1/CP1252 +- chardet: 58% accurate on UTF-8, 83% on Latin-1/CP1252 +- Overall: Both tied at 65% accuracy +- charset-normalizer is slower but better for UTF-8 +- chardet is faster and better for legacy 8-bit encodings -### `trial_codecs` behavior parameter -- **Deprecated** - Document as ignored -- Keep in API for compatibility but don't use -- New situational logic replaces fixed codec lists +**Decision**: Stick with chardet for now, provides good balance. -## Charset-Normalizer Investigation +See: `.auxiliary/notes/charset-detector-evaluation-results.md` -Before implementing, test `charset-normalizer` vs `chardet`: +## Related Files -1. Compare on wide variety of byte patterns -2. Verify it "normalizes" to useful/standard encodings -3. Measure performance characteristics -4. Document findings +- Implementation: `sources/detextive/decoders.py`, `sources/detextive/charsets.py` +- Evaluations: `.auxiliary/evaluations/compare-charset-detectors.py` (and related) +- Results: `.auxiliary/notes/charset-detector-evaluation-results.md` -`charset-normalizer` is already in dev environment. +## Open Questions -## Related Issues +1. Should we simplify back to context-based trial order? +2. Keep or remove permissive/restrictive categorization? +3. How much complexity is justified for marginal accuracy gains? +4. What's the right balance between "smart" and "simple"? -### Windows Python 3.11+ Doctest Failure +## The Honest Documentation Approach -Current failure: -``` -Expected: 'Café ★' -Got: 'Café ★' +```python +""" +decode() attempts decoding in context-aware order: +1. User supplement (you know your data best) +2. OS default (sensible for local files) +3. Python default (usually UTF-8) +4. Detected charset (if confidence is high) + +Charset detection is heuristic and cannot solve fundamental +ambiguities without context. For best results: +- Provide charset_supplement when encoding is known +- Use http_content_type for web content +- Validate results with is_valid_text() +- Consider confidence scores from detect_charset_confidence() + +There is no magic bullet for charset detection. We provide +sensible defaults and give you control over the process. +""" ``` -Our code is producing UTF-8-as-cp1252 mojibake on Windows. 
The refactor should fix this by: -1. Detecting UTF-8 via `detect_charset_reliable()` -2. Trying UTF-8 (non-8-bit informative charset) -3. Successfully decoding and validating - -### Three Trial Codecs Usage Patterns - -Previously documented patterns become: -1. **Opportunistic Decoding** → New `decode()` logic -2. **Authoritative Validation** → HTTP Content-Type handling -3. **Detection Confirmation** → `detect_charset_reliable()` - -The fixed lists are replaced by situational logic based on charset properties. - -## Implementation Plan - -1. Implement and test `is_permissive_charset()` with caching -2. Implement `detect_charset_reliable()` -3. Refactor `decode()` with new candidate selection -4. Update documentation to deprecate `trial_codecs` -5. Test charset-normalizer vs chardet -6. Verify Windows Python 3.11+ doctests pass -7. Update architecture documentation - -## Resolved Design Questions - -1. **Authoritative charset failure**: Always fall back to detection (not configurable). Users who want exceptions can parse the header themselves and call `.decode()` directly. -2. **`detect_charset_reliable()` public API**: Yes, add both `detect_charset_reliable()` and `detect_charset_confidence_reliable()` to public API. -3. **`infer_*()` functions refactoring**: Defer for later; minor updates may be needed but not part of this refactor. -4. **Validation timing**: Respect existing `behaviors.text_validate` configuration; difference between during/after selection is minimal. -5. **Trust non-8-bit detections**: No, must consider confidence levels. Even multi-byte charsets can be misdetected; 7-bit (ASCII) is especially unreliable. -6. **Reuse existing functions**: Yes, use `attempt_decodes()` and existing trial decode helpers rather than reimplementing. - -## All Design Questions Resolved - -1. **Confidence threshold**: Use existing `behaviors.charset_confidence_threshold` from DTO -2. **Permissive charset caching**: Always enabled (no flag needed, minimal memory) -3. **Candidate prioritization**: Shortest always wins, user supplement is tie-breaker -4. **Multi-byte detection**: Test both ascending and descending byte sequences, check decoded length == 256 +**Complexity should serve users, not obscure limitations.** diff --git a/.auxiliary/notes/text-validation.md b/.auxiliary/notes/text-validation.md new file mode 100644 index 0000000..7db5c6d --- /dev/null +++ b/.auxiliary/notes/text-validation.md @@ -0,0 +1,200 @@ +# Text Validation and the Irrelevance of Confidence Thresholds + +## Summary + +The `text_validate_confidence` parameter should be removed. Text validation checks whether decoded content looks like real text (not binary data that happened to decode successfully). This is orthogonal to charset detection confidence and doesn't benefit from a confidence threshold. + +## Current Behavior + +### `text_validate_confidence` Usage + +Currently used in `_validate_text()` to decide whether to validate: + +```python +match behaviors.text_validate: + case BehaviorTristate.AsNeeded: + should_validate = confidence < behaviors.text_validate_confidence +``` + +Default threshold: 0.80 + +### Where Validation Is Called + +1. **From `_attempt_decode_http_content_type()`**: + - Passes `result.confidence` from `attempt_decodes()` + - This is size-based confidence: `confidence_from_bytes_quantity()` + +2. **From `_attempt_decodes()` (main decode path)**: + - Passes `0.0` confidence (hardcoded!) 
+ - **Always validates** regardless of threshold + +## What Validation Actually Checks + +`PROFILE_TEXTUAL` (the default validation profile) checks: + +- **Control characters**: Only allows `\t`, `\n`, `\r`, plus bidi/zero-width/formatting characters +- **Rejectable categories**: Rejects Unicode categories: + - `Cc` (control characters) + - `Cf` (format characters) + - `Co` (private use) + - `Cs` (surrogate) +- **Printables ratio**: Requires ≥85% printable characters +- **Explicit rejects**: DELETE character (0x7F) + +### What Validation Catches + +Validation catches **binary/non-textual data that successfully decoded**: + +- Binary PNG data decoded as CP1252 → fails printables ratio +- UTF-16 data decoded as UTF-8 → produces garbage with control characters +- Random binary content decoded as ISO-8859-1 → fails printables ratio +- Mojibake from wrong charset → may contain unprintables + +**Key insight**: Any charset can decode binary data without raising `UnicodeDecodeError`. Validation is the only way to catch these false positives. + +## Why Confidence Threshold Doesn't Help + +### 1. Always 0.0 in Main Decode Path + +In `_attempt_decodes()`, confidence is hardcoded to `0.0`: + +```python +return _validate_text( + text, 0.0, # ← Always 0.0 + behaviors=behaviors, profile=profile, location=location) +``` + +This means: +- Validation **always runs** in the main decode path +- The `text_validate_confidence` threshold is never actually checked +- The parameter is effectively dead code for normal decoding + +### 2. Validation Is Not About Detection Quality + +Confidence reflects: "How sure are we this is the right charset?" + +Validation checks: "Does this look like real text?" + +These are **orthogonal concerns**: +- High-confidence UTF-8 detection can still produce mojibake if the actual charset was CP1252 +- Low-confidence detection on small sample might be correct and produce valid text +- Wrong charset with high confidence → valid-looking text that happens to be garbage + +### 3. Sample Size Doesn't Reduce Need for Validation + +The argument for confidence threshold might be: +> "Large files with high-confidence detection don't need validation" + +But this is wrong because: +- Large binary files (images, executables) can still decode as text +- HTTP headers can lie about charset +- Validation is cheap (character category checks) +- Better to validate anyway + +### 4. Any Charset Can Encode Binary Data + +All charsets can represent control characters: +- UTF-8: `\x00`, `\x01`, `\x02`, etc. +- CP1252: Control chars in 0x00-0x1F range +- ISO-8859-1: Fully permissive, decodes everything + +There's no charset-based reason to skip validation. 
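+
+A short self-contained sketch of the point: binary-looking bytes decode without error under a permissive charset, yet a simple printables-ratio check still flags them. (This mirrors the spirit of `PROFILE_TEXTUAL`'s checks, not its exact implementation.)
+
+```python
+# Sketch: decode success proves nothing; a printables-ratio check in the
+# spirit of PROFILE_TEXTUAL (not its exact logic) catches the binary data.
+import unicodedata
+
+sample = b'\x89PNG\r\n\x1a\n' + bytes(range(32))  # binary-ish bytes
+text = sample.decode('iso-8859-1')                # never raises
+
+def printables_ratio(text: str) -> float:
+    allowed = {'\t', '\n', '\r'}
+    rejectable = ('Cc', 'Cf', 'Co', 'Cs')
+    printable = sum(
+        1 for char in text
+        if char in allowed or unicodedata.category(char) not in rejectable)
+    return printable / len(text)
+
+print(printables_ratio(text))  # 0.225, far below an 85% printables threshold
+```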
+ +## Proposed Changes + +### Remove Confidence Threshold + +Change validation logic from: + +```python +match behaviors.text_validate: + case BehaviorTristate.AsNeeded: + should_validate = confidence < behaviors.text_validate_confidence +``` + +To: + +```python +match behaviors.text_validate: + case BehaviorTristate.AsNeeded: + should_validate = True # Always validate when AsNeeded +``` + +Or simplify the tristate entirely: +- `Always`: Validate +- `Never`: Don't validate +- `AsNeeded`: **Remove** (was equivalent to Always in practice) + +### Simplify to Boolean + +Even simpler option: + +```python +class Behaviors: + text_validate: bool = True # Just True/False +``` + +But keeping the tristate maintains API compatibility and clarity: +- `Always`: Validate (explicit) +- `AsNeeded`: Validate (matches current behavior) +- `Never`: Don't validate (opt-out for performance) + +### Remove Parameter + +Delete from `Behaviors`: + +```python +text_validate_confidence: float = 0.80 # ← Remove this +``` + +### Update Signature + +`_validate_text()` can keep the confidence parameter for now (for backward compatibility in internal calls), but ignore it: + +```python +def _validate_text( + text: str, confidence: float, /, *, # confidence unused + behaviors: BehaviorsArgument, + profile: ProfileArgument, + location: LocationArgument, +) -> str: + # Don't check confidence, just validate based on tristate + ... +``` + +Or remove it entirely and update all call sites. + +## Why Validation Is Important + +Validation is **critical** for detextive's reliability: + +1. **Catches wrong charsets**: ISO-8859-1 can decode UTF-8 as mojibake +2. **Catches binary data**: Images, executables, etc. that decode without errors +3. **Provides meaningful errors**: Better to fail with "TextInvalidity" than return garbage +4. **Aligns with design philosophy**: "Honest about limitations" → validate results + +## Performance Considerations + +**Validation is cheap**: +- Samples only first 8192 characters by default (`profile.sample_quantity`) +- Character category lookup is O(1) with Unicode data +- Ratio calculations are simple arithmetic +- Negligible compared to charset detection + +**No need to skip validation for performance.** + +## Recommendation + +1. **Remove `text_validate_confidence` parameter** from `Behaviors` +2. **Keep `text_validate` tristate** for user control +3. **Always validate when `AsNeeded`** (remove confidence check) +4. **Update documentation** to clarify that validation is about textuality, not confidence +5. **Update vulturefood.py** to remove `text_validate_confidence` entry + +This simplifies the API, removes dead code, and aligns behavior with actual needs. + +## Related Documents + +- `.auxiliary/notes/confidence.md` - Confidence scoring strategy +- `.auxiliary/notes/decode-refactor.md` - Design philosophy and simplification +- `sources/detextive/validation.py` - Validation profiles and logic diff --git a/decode-refactor--progress.md b/decode-refactor--progress.md new file mode 100644 index 0000000..a121df9 --- /dev/null +++ b/decode-refactor--progress.md @@ -0,0 +1,63 @@ +# Decode Refactor Progress + +## Overview + +This document tracks the progress of the `decode` function refactor, aiming to simplify the charset detection and decoding process by moving towards a context-aware trial order (User -> OS -> Python -> Detection) and reducing complexity around permissive/restrictive categorization. 
+ +## Comparisons: `decode-refactor` vs `master` + +### `sources/detextive/decoders.py` +- **Refactored `decode` function**: Now implements the simplified logic. +- **New `_attempt_decodes`**: Implements the trial order: + 1. Prepares charsets using `_prepare_charsets` (User, OS, Python, Detection). + 2. Splits candidates into `restrictives` and `permissives`. + 3. Tries `restrictives` first, then `permissives`. +- **New `_prepare_charsets`**: Collects charsets and sorts them into permissive/restrictive lists based on `is_permissive_charset`. +- **New `_validate_text`**: Centralized text validation logic. +- **BOM Handling**: Uses `behaviors.remove_bom` to normalize charsets (e.g., `utf-8` -> `utf-8-sig`). +- **TODO**: Deprecation warnings for `mimetype_*` arguments. + +### `sources/detextive/charsets.py` +- **`is_permissive_charset`**: Added to identify charsets that accept all byte sequences (e.g., ISO-8859-*). +- **`attempt_decodes`**: Updated to use `set` for trials and `normalize_charset` with `bom_cognizant`. +- **`normalize_charset`**: Added `bom_cognizant` parameter. +- **`_charsets_permissive`**: Cache for permissive checks. +- **TODO**: Accretive dictionary comment. + +### `sources/detextive/detectors.py` +- **`detect_charset_confidence`**: Defaults to `default` charset instead of hardcoded 'utf-8' when content is empty. + +### `sources/detextive/inference.py` +- **`validate_httpct_charset`**: Added helper. + +### `sources/detextive/core.py` +- **`Behaviors`**: Added `remove_bom` field (default `True`). + +## Current Status + +The simplified design described in `.auxiliary/notes/decode-refactor.md` has been largely implemented. The logic follows the "Context-based trial order". + +## Issues & Remaining Work + +1. **Test Failures**: + - `tests/test_000_detextive/test_220_charsets.py`: `test_220_codec_specifiers_user_supplement` fails. + - Expects `utf-8`, gets `utf-8-sig`. + - Caused by `behaviors.remove_bom=True` default and `normalize_charset` converting `utf-8` to `utf-8-sig`. + - Action: Update test to expect `utf-8-sig` or allow configuring `remove_bom` in test. + +2. **Code Cleanup**: + - Address `TODO` in `decoders.py`: Deprecation warnings for `mimetype_*`. + - Address `TODO` in `charsets.py`: Accretive dictionary. + +3. **Verification**: + - Ensure the new `decode` logic in `decoders.py` is properly tested. Current tests might be testing `charsets.attempt_decodes` which is used by `_attempt_decode_http_content_type` but the main `decode` path uses `decoders._attempt_decodes`. + +4. **Refactor Review**: + - `decoders.py` has a local `_attempt_decodes` and `charsets.py` has `attempt_decodes`. This duplication/naming should be reviewed. `charsets.attempt_decodes` is still used for HTTP content type decoding. + +## Next Steps + +1. Fix the failing test `test_220_charsets.py`. +2. Implement deprecation warnings. +3. Add tests specifically covering the new `decoders.decode` logic and trial order. +4. Verify behavior with CP1252 vs UTF-8 scenarios as noted in the design docs. 
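+
+As a starting point for item 4, a characterization harness can exercise the ambiguous scenarios without asserting outcomes (assumes `decode` is exported at the package level with the keyword arguments shown in `decoders.py`; it prints results because the CP1252-before-UTF-8 ordering behavior is still an open question):
+
+```python
+# Characterization harness only: prints what decode() returns for the
+# CP1252 vs UTF-8 cases rather than asserting, since the desired
+# ordering behavior is not yet settled.
+import detextive
+
+SAMPLE = 'Café ★'
+cases = {
+    'utf-8 bytes, no supplement': (SAMPLE.encode('utf-8'), {}),
+    'utf-8 bytes, utf-8 supplement': (
+        SAMPLE.encode('utf-8'), {'charset_supplement': 'utf-8'}),
+    'cp1252 bytes, cp1252 supplement': (
+        'Café'.encode('cp1252'), {'charset_supplement': 'cp1252'}),
+}
+for label, (content, kwargs) in cases.items():
+    print(f"{label}: {detextive.decode(content, **kwargs)!r}")
+```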
diff --git a/sources/detextive/__/imports.py b/sources/detextive/__/imports.py index ff4cf81..5dd8e79 100644 --- a/sources/detextive/__/imports.py +++ b/sources/detextive/__/imports.py @@ -24,6 +24,7 @@ import collections.abc as cabc import codecs +import contextlib as ctxl import dataclasses as dcls import enum import locale diff --git a/sources/detextive/charsets.py b/sources/detextive/charsets.py index 79352df..090fe6c 100644 --- a/sources/detextive/charsets.py +++ b/sources/detextive/charsets.py @@ -35,7 +35,10 @@ ) -def attempt_decodes( +_charsets_permissive: dict[ str, bool ] = { } # TODO: Accretive dictionary. + + +def attempt_decodes( # noqa: PLR0915 content: _nomina.Content, /, *, behaviors: _Behaviors = _BEHAVIORS_DEFAULT, inference: __.Absential[ str ] = __.absent, @@ -50,7 +53,7 @@ def attempt_decodes( confidence = _core.confidence_from_bytes_quantity( content, behaviors = behaviors ) on_decode_error = behaviors.on_decode_error - trials: list[ str ] = [ ] + trials: set[ str ] = set( ) for codec in behaviors.trial_codecs: match codec: case _CodecSpecifiers.FromInference: @@ -65,14 +68,16 @@ def attempt_decodes( charset = supplement case str( ): charset = codec case _: continue + charset = normalize_charset( + charset, bom_cognizant = behaviors.remove_bom ) + if charset in trials: continue try: text = content.decode( charset, errors = on_decode_error ) - except UnicodeDecodeError: - trials.append( charset ) - continue + except UnicodeDecodeError: continue + finally: trials.add( charset ) result = _CharsetResult( charset = charset, confidence = confidence ) return text, result raise _exceptions.ContentDecodeFailure( - charset = trials, location = location ) + charset = tuple( trials ), location = location ) def discover_os_charset_default( ) -> str: @@ -82,9 +87,33 @@ def discover_os_charset_default( ) -> str: return normalize_charset( discoverer( ) ) -def normalize_charset( charset: str ) -> str: +def is_permissive_charset( charset: str ) -> bool: + ''' Checks if charset accepts all byte sequences (8-bit encoding). + + Returns ``True`` for ISO-8859-*, etc.... + Returns ``False`` for ASCII, CP1252, UTF-8, SHIFT-JIS, etc.... + ''' + charset_ = normalize_charset( charset ) + if charset_ in _charsets_permissive: + return _charsets_permissive[ charset_ ] + try: + texta = bytes( range( 256 ) ).decode( + charset_, errors = 'strict' ) + textd = bytes( range( 255, -1, -1 ) ).decode( + charset_, errors = 'strict' ) + except ( UnicodeDecodeError, LookupError ): + _charsets_permissive[ charset_ ] = False + return False + permissivity = ( len( texta ) == len( textd ) == 256 ) # noqa: PLR2004 + _charsets_permissive[ charset_ ] = permissivity + return permissivity + + +def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str: ''' Normalizes character set encoding names. ''' - return __.codecs.lookup( charset ).name + charset_ = __.codecs.lookup( charset ).name + if bom_cognizant and charset_ == 'utf-8': return 'utf-8-sig' + return charset_ def trial_decode_as_confident( # noqa: PLR0913 diff --git a/sources/detextive/core.py b/sources/detextive/core.py index a64f79c..7691eea 100644 --- a/sources/detextive/core.py +++ b/sources/detextive/core.py @@ -114,6 +114,9 @@ class Behaviors( __.immut.DataclassObject ): 'codecs' module. ''' ), ] = 'strict' + remove_bom: __.typx.Annotated[ + bool, __.ddoc.Doc( ''' Remove byte-ordering mark? ''' ) + ] = True text_validate: __.typx.Annotated[ BehaviorTristate, __.ddoc.Doc( ''' When to validate text. 
''' ), diff --git a/sources/detextive/decoders.py b/sources/detextive/decoders.py index 5685e23..eb3f9aa 100644 --- a/sources/detextive/decoders.py +++ b/sources/detextive/decoders.py @@ -23,10 +23,9 @@ from . import __ from . import charsets as _charsets -from . import core as _core +from . import detectors as _detectors from . import exceptions as _exceptions from . import inference as _inference -from . import mimetypes as _mimetypes from . import nomina as _nomina from . import validation as _validation @@ -37,6 +36,7 @@ BehaviorTristate as _BehaviorTristate, BehaviorsArgument as _BehaviorsArgument, CharsetResult as _CharsetResult, + CodecSpecifiers as _CodecSpecifiers, ) @@ -52,49 +52,172 @@ def decode( # noqa: PLR0913 mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent, ) -> str: ''' Decodes bytes array to Unicode text. ''' + # TODO: Deprecation warnings for 'mimetype_*' arguments. if content == b'': return '' - behaviors_ = __.dcls.replace( - behaviors, trial_decode = _BehaviorTristate.Never ) - try: - mimetype_result, charset_result = ( - _inference.infer_mimetype_charset_confidence( + result: __.Absential[ _CharsetResult ] = __.absent + text: __.Absential[ str ] = __.absent + if not __.is_absent( http_content_type ): + text = _attempt_decode_http_content_type( + content, http_content_type, + behaviors = behaviors, profile = profile, location = location ) + if not __.is_absent( text ): return text + if __.is_absent( result ): + behaviors_ = __.dcls.replace( + behaviors, trial_decode = _BehaviorTristate.Never ) + with __.ctxl.suppress( _exceptions.CharsetDetectFailure ): + result = _detectors.detect_charset_confidence( content, behaviors = behaviors_, - charset_default = charset_default, - mimetype_default = mimetype_default, - http_content_type = http_content_type, - charset_supplement = charset_supplement, - mimetype_supplement = mimetype_supplement, - location = location ) ) - except _exceptions.Omnierror: - charset = ( - 'utf-8-sig' if __.is_absent( charset_supplement ) - else charset_supplement ) - confidence = _core.confidence_from_bytes_quantity( content, behaviors ) - charset_result = _CharsetResult( - charset = charset, confidence = confidence ) - else: - if ( not _mimetypes.is_textual_mimetype( mimetype_result.mimetype ) - and charset_result.charset is None - ): raise _exceptions.ContentDecodeImpossibility( location = location ) - # When any reasonable doubt exists, we attempt decodes. - # Trial decodes and text validation is the only way to be certain. 
- text, result = _charsets.attempt_decodes( - content, + default = charset_default, + supplement = charset_supplement, + location = location ) + return _attempt_decodes( + content, result, behaviors = behaviors, - inference = ( - 'utf-8-sig' if charset_result.charset is None - else charset_result.charset ), + profile = profile, supplement = charset_supplement, location = location ) + + +def _attempt_decode_http_content_type( + content: _nomina.Content, + http_content_type: str, /, *, + behaviors: _BehaviorsArgument, + profile: _validation.ProfileArgument, + location: _nomina.LocationArgument, +) -> __.Absential[ str ]: + charset: __.Absential[ __.typx.Optional[ str ] ] = __.absent + result: __.Absential[ _CharsetResult ] = __.absent + error = _exceptions.ContentDecodeImpossibility( location = location ) + _, charset = _inference.parse_http_content_type( http_content_type ) + if charset is None: raise error + if __.is_absent( charset ): return __.absent + behaviors_ = __.dcls.replace( + behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ) + try: + text, result = _charsets.attempt_decodes( + content, + behaviors = behaviors_, inference = charset, location = location ) + except _exceptions.ContentDecodeFailure: return __.absent + # Allow other errors to propagate. + if not __.is_absent( text ) and not __.is_absent( result ): + return _validate_text( + text, result.confidence, + behaviors = behaviors, profile = profile, location = location ) + return __.absent + + +def _append_charset( + permissives: list[ str ], + restrictives: list[ str ], + charset: str, + bom_cognizant: bool, +) -> None: + charset_ = _charsets.normalize_charset( + charset, bom_cognizant = bom_cognizant ) + if _charsets.is_permissive_charset( charset_ ): + if charset_ in permissives: return + permissives.append( charset_ ) + else: + if charset_ in restrictives: return + restrictives.append( charset_ ) + + +def _attempt_decodes( # noqa: PLR0913 + content: _nomina.Content, + detection: __.Absential[ _CharsetResult ], /, *, + behaviors: _BehaviorsArgument, + profile: _validation.ProfileArgument, + supplement: __.Absential[ str ], + location: _nomina.LocationArgument, +) -> str: + error = _exceptions.ContentDecodeImpossibility( location = location ) + permissives, restrictives = _prepare_charsets( + detection, behaviors = behaviors, supplement = supplement ) + on_decode_error = behaviors.on_decode_error + # Try restrictive charsets before permissive charsets, since: + # (1) Restrictive charsets can have decoding errors from invalid byte + # sequences. + # (2) Restrictive charsets can produce shorter strings, if they are + # multi-byte encodings. Permissive charsets decoding the same byte + # sequences will likely result in mojibake. + for charset in restrictives: + try: text = content.decode( charset, errors = on_decode_error ) + except UnicodeDecodeError: continue + try: + return _validate_text( + text, 0.0, + behaviors = behaviors, profile = profile, location = location ) + except _exceptions.TextInvalidity: continue + for charset in permissives: + try: text = content.decode( charset, errors = on_decode_error ) + except UnicodeDecodeError: continue + try: + return _validate_text( + text, 0.0, + behaviors = behaviors, profile = profile, location = location ) + except _exceptions.TextInvalidity: continue + raise error + + +def _prepare_charsets( + detection: __.Absential[ _CharsetResult ], /, *, + behaviors: _BehaviorsArgument, + supplement: __.Absential[ str ], +) -> tuple[ tuple[ str, ... ], tuple[ str, ...
] ]: + permissives: list[ str ] = [ ] + restrictives: list[ str ] = [ ] + os_charset = _charsets.discover_os_charset_default( ) + _append_charset( + permissives, restrictives, os_charset, behaviors.remove_bom ) + python_charset = __.locale.getpreferredencoding( ) + _append_charset( + permissives, restrictives, python_charset, behaviors.remove_bom ) + if not __.is_absent( supplement ): + _prepend_charset( + permissives, restrictives, supplement, behaviors.remove_bom ) + if not __.is_absent( detection ) and detection.charset is not None: + # Suspicious charset detections go at end. + if detection.confidence < behaviors.trial_decode_confidence: + _append_charset( + permissives, restrictives, detection.charset, + behaviors.remove_bom ) + else: + _prepend_charset( + permissives, restrictives, detection.charset, + behaviors.remove_bom ) + return tuple( permissives ), tuple( restrictives ) + + +def _prepend_charset( + permissives: list[ str ], + restrictives: list[ str ], + charset: str, + bom_cognizant: bool, +) -> None: + charset_ = _charsets.normalize_charset( + charset, bom_cognizant = bom_cognizant ) + if _charsets.is_permissive_charset( charset_ ): + if charset_ in permissives: return + permissives.insert( 0, charset_ ) + else: + if charset_ in restrictives: return + restrictives.insert( 0, charset_ ) + + +def _validate_text( + text: str, confidence: float, /, *, + behaviors: _BehaviorsArgument, + profile: _validation.ProfileArgument, + location: _nomina.LocationArgument, +) -> str: + error = _exceptions.TextInvalidity( location = location ) should_validate = False match behaviors.text_validate: case _BehaviorTristate.Always: should_validate = True case _BehaviorTristate.AsNeeded: - should_validate = ( - result.confidence < behaviors.text_validate_confidence ) + should_validate = confidence < behaviors.text_validate_confidence case _BehaviorTristate.Never: pass - if should_validate and not profile( text ): - raise _exceptions.TextInvalidity( location = location ) + if should_validate and not profile( text ): raise error return text diff --git a/sources/detextive/detectors.py b/sources/detextive/detectors.py index fff8000..ef89984 100644 --- a/sources/detextive/detectors.py +++ b/sources/detextive/detectors.py @@ -116,7 +116,7 @@ def detect_charset_confidence( # noqa: PLR0913 ) -> _CharsetResult: ''' Detects character set candidates with confidence scores. 
''' if b'' == content: - return _CharsetResult( charset = 'utf-8', confidence = 1.0 ) + return _CharsetResult( charset = default, confidence = 1.0 ) for name in behaviors.charset_detectors_order: detector = charset_detectors.get( name ) if detector is None: continue diff --git a/sources/detextive/inference.py b/sources/detextive/inference.py index d77b32b..7d1cff4 100644 --- a/sources/detextive/inference.py +++ b/sources/detextive/inference.py @@ -200,6 +200,17 @@ def parse_http_content_type( return __.absent, __.absent +def validate_httpct_charset( + content: _nomina.Content, + charset: str, /, *, + behaviors: _Behaviors = _BEHAVIORS_DEFAULT, +) -> __.Absential[ _CharsetResult ]: + behaviors_ = __.dcls.replace( + behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ) + return _charsets.trial_decode_as_confident( + content, behaviors = behaviors_, inference = charset ) + + def _determine_parse_detect( detect_tristate: _BehaviorTristate, should_parse = False ) -> tuple[ bool, bool ]: @@ -229,14 +240,8 @@ def _validate_http_content_type( elif charset is None: charset_result = _CharsetResult( charset = None, confidence = 0.9 ) else: - # HTTP header provides explicit charset - only try that, not OS default - behaviors_http = __.dcls.replace( - behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ) - charset_result = _charsets.trial_decode_as_confident( - content, - behaviors = behaviors_http, - inference = charset, - supplement = charset_supplement ) + charset_result = validate_httpct_charset( + content, charset, behaviors = behaviors ) if __.is_absent( mimetype ): mimetype_result = __.absent else: mimetype_result = _MimetypeResult(