diff --git a/documentation/architecture/designs/001-python-api.rst b/documentation/architecture/designs/001-python-api.rst deleted file mode 100644 index 6718867..0000000 --- a/documentation/architecture/designs/001-python-api.rst +++ /dev/null @@ -1,463 +0,0 @@ -.. vim: set fileencoding=utf-8: -.. -*- coding: utf-8 -*- -.. +--------------------------------------------------------------------------+ - | | - | Licensed under the Apache License, Version 2.0 (the "License"); | - | you may not use this file except in compliance with the License. | - | You may obtain a copy of the License at | - | | - | http://www.apache.org/licenses/LICENSE-2.0 | - | | - | Unless required by applicable law or agreed to in writing, software | - | distributed under the License is distributed on an "AS IS" BASIS, | - | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | - | See the License for the specific language governing permissions and | - | limitations under the License. | - | | - +--------------------------------------------------------------------------+ - - -******************************************************************************* -001. Python API Specification -******************************************************************************* - -Overview -=============================================================================== - -This document specifies the Python API implementing context-aware -text detection with pluggable backend support, confidence-based detection, -and optional dependency architecture. - -The design follows established project practices for interface contracts, -module organization, naming conventions, and provides both simple string-based -APIs and confidence-aware APIs with structured result types. 
- -Public Interface Specification -=============================================================================== - -Core Type Definitions -------------------------------------------------------------------------------- - -**Confidence-Based Result Types** - -.. code-block:: python - - class CharsetResult( __.immut.DataclassObject ): - ''' Character set encoding with detection confidence. ''' - - charset: __.typx.Annotated[ - __.typx.Optional[ str ], - __.ddoc.Doc( ''' Detected character set encoding. May be None. ''' ), - ] - confidence: __.typx.Annotated[ - float, __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ) - ] - - class MimetypeResult( __.immut.DataclassObject ): - ''' MIME type with detection confidence. ''' - - mimetype: __.typx.Annotated[ - str, __.ddoc.Doc( ''' Detected MIME type. ''' ) - ] - confidence: __.typx.Annotated[ - float, __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ) - ] - - -**Configuration Types** - -.. code-block:: python - - class BehaviorTristate( __.enum.Enum ): - ''' When to apply behavior. ''' - - Never = __.enum.auto( ) - AsNeeded = __.enum.auto( ) - Always = __.enum.auto( ) - - class DetectFailureActions( __.enum.Enum ): - ''' Possible responses to detection failure. ''' - - Default = __.enum.auto( ) - Error = __.enum.auto( ) - - class CodecSpecifiers( __.enum.Enum ): - ''' Specifiers for dynamic codecs. ''' - - FromInference = __.enum.auto( ) - OsDefault = __.enum.auto( ) - PythonDefault = __.enum.auto( ) - UserSupplement = __.enum.auto( ) - - class Behaviors( __.immut.DataclassObject ): - ''' How functions behave. ''' - - charset_detectors_order: __.typx.Annotated[ - __.cabc.Sequence[ str ], - __.ddoc.Doc( ''' Order in which charset detectors are applied. ''' ), - ] = ( 'chardet', 'charset-normalizer' ) - - charset_on_detect_failure: __.typx.Annotated[ - DetectFailureActions, - __.ddoc.Doc( ''' Action to take on charset detection failure. 
''' ), - ] = DetectFailureActions.Default - - mimetype_detectors_order: __.typx.Annotated[ - __.cabc.Sequence[ str ], - __.ddoc.Doc( ''' Order in which MIME type detectors are applied. ''' ), - ] = ( 'magic', 'puremagic' ) - - mimetype_on_detect_failure: __.typx.Annotated[ - DetectFailureActions, - __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ), - ] = DetectFailureActions.Default - - charset_detect: __.typx.Annotated[ - BehaviorTristate, - __.ddoc.Doc( ''' When to detect charset from content. ''' ), - ] = BehaviorTristate.AsNeeded - - mimetype_detect: __.typx.Annotated[ - BehaviorTristate, - __.ddoc.Doc( ''' When to detect MIME type from content. ''' ), - ] = BehaviorTristate.AsNeeded - -Simple String-Based Detection Functions -------------------------------------------------------------------------------- - -**Character Encoding Detection** - -.. code-block:: python - - def detect_charset( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - default: str = CHARSET_DEFAULT, - supplement: __.Absential[ str ] = __.absent, - mimetype: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> __.typx.Optional[ str ]: - ''' Detects character encoding. - - Returns the most likely character encoding. When configured for - default return behavior, returns the default value on detection - failure rather than raising an exception. - ''' - - def detect_mimetype( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - default: str = MIMETYPE_DEFAULT, - charset: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> str: - ''' Detects MIME type. - - Returns the most likely MIME type. When configured for default - return behavior, returns the default value on detection failure - rather than raising an exception. - ''' - -**Inference Functions with Context Support** - -.. 
code-block:: python - - def infer_charset( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - charset_default: str = CHARSET_DEFAULT, - http_content_type: __.Absential[ str ] = __.absent, - charset_supplement: __.Absential[ str ] = __.absent, - mimetype_supplement: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> __.typx.Optional[ str ]: - ''' Infers charset through various means. - - Utilizes HTTP Content-Type headers, location hints, and content - analysis for contextual charset inference. Supports configurable - default return behavior on inference failure. - ''' - - def infer_mimetype_charset( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - charset_default: str = CHARSET_DEFAULT, - mimetype_default: str = MIMETYPE_DEFAULT, - http_content_type: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - charset_supplement: __.Absential[ str ] = __.absent, - mimetype_supplement: __.Absential[ str ] = __.absent, - ) -> tuple[ str, __.typx.Optional[ str ] ]: - ''' Detects MIME type and charset with context support. - - Returns tuple of (mimetype, charset). Provides comprehensive - detection utilizing all available context with configurable - default behavior on detection failure. - ''' - -Confidence-Based Detection Functions -------------------------------------------------------------------------------- - -**Core Confidence Functions** - -.. code-block:: python - - def detect_charset_confidence( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - default: str = CHARSET_DEFAULT, - supplement: __.Absential[ str ] = __.absent, - mimetype: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> CharsetResult: - ''' Detects character encoding with confidence scoring. - - Returns CharsetResult with charset and confidence level. 
When - configured for default return behavior, returns default value - with zero confidence on detection failure. - ''' - - def detect_mimetype_confidence( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - default: str = MIMETYPE_DEFAULT, - charset: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> MimetypeResult: - ''' Detects MIME type with confidence scoring. - - Returns MimetypeResult with mimetype and confidence level. When - configured for default return behavior, returns default value - with zero confidence on detection failure. - ''' - -**Advanced Confidence Inference** - -.. code-block:: python - - def infer_charset_confidence( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - charset_default: str = CHARSET_DEFAULT, - http_content_type: __.Absential[ str ] = __.absent, - charset_supplement: __.Absential[ str ] = __.absent, - mimetype_supplement: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - ) -> CharsetResult: - ''' Infers charset with confidence through various means. - - Utilizes contextual information for enhanced detection quality. - Supports configurable default return behavior on inference failure. - ''' - - def infer_mimetype_charset_confidence( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - charset_default: str = CHARSET_DEFAULT, - mimetype_default: str = MIMETYPE_DEFAULT, - http_content_type: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - charset_supplement: __.Absential[ str ] = __.absent, - mimetype_supplement: __.Absential[ str ] = __.absent, - ) -> tuple[ MimetypeResult, CharsetResult ]: - ''' Detects MIME type and charset with confidence scoring. - - Returns tuple of (MimetypeResult, CharsetResult) with full - confidence information for both detection results. Supports - configurable default behavior on detection failure. 
- ''' - -**Confidence Utility Functions** - -.. code-block:: python - - def confidence_from_bytes_quantity( - content: Content, - behaviors: Behaviors = BEHAVIORS_DEFAULT - ) -> float: - ''' Calculates confidence score based on content length. - - Returns confidence value from 0.0 to 1.0 based on the amount - of content available for analysis. - ''' - -High-Level Decoding and Validation -------------------------------------------------------------------------------- - -**Content Decoding** - -.. code-block:: python - - def decode( - content: Content, /, *, - behaviors: Behaviors = BEHAVIORS_DEFAULT, - profile: TextValidationProfile = PROFILE_TEXTUAL, - charset_default: str = CHARSET_DEFAULT, - mimetype_default: str = MIMETYPE_DEFAULT, - http_content_type: __.Absential[ str ] = __.absent, - location: __.Absential[ Location ] = __.absent, - charset_supplement: __.Absential[ str ] = __.absent, - mimetype_supplement: __.Absential[ str ] = __.absent, - ) -> str: - ''' High-level bytes-to-text decoding with validation. - - Performs comprehensive detection, decoding, and validation - for robust text extraction from byte content. Supports - configurable default values for graceful degradation. - ''' - -**Textual Content Validation** - -.. code-block:: python - - def is_textual_mimetype( mimetype: str ) -> bool: - ''' Validates if MIME type represents textual content. - - Returns True for MIME types representing textual content. - ''' - - def is_valid_text( - text: str, - profile: TextValidationProfile = PROFILE_TEXTUAL - ) -> bool: - ''' Unicode-aware text validation with configurable profiles. - - Returns True for text meeting the specified validation profile. - ''' - -Line Separator Processing -------------------------------------------------------------------------------- - -**LineSeparators Enum** (unchanged from v1.x specification) - -.. code-block:: python - - class LineSeparators( __.enum.Enum ): - ''' Line separators for cross-platform text processing. 
''' - - CR = '\r' # Classic MacOS (0xD) - CRLF = '\r\n' # DOS/Windows (0xD 0xA) - LF = '\n' # Unix/Linux (0xA) - - @classmethod - def detect_bytes( - selfclass, - content: __.cabc.Sequence[ int ] | bytes, - limit: int = 1024 - ) -> __.typx.Optional[ 'LineSeparators' ]: - ''' Detects line separator from byte content sample. ''' - - @classmethod - def normalize_universal( selfclass, content: str ) -> str: - ''' Normalizes all line separators to Unix LF format. ''' - - def normalize( self, content: str ) -> str: - ''' Normalizes specific line separator to Unix LF format. ''' - - def nativize( self, content: str ) -> str: - ''' Converts Unix LF to this platform's line separator. ''' - -Type Annotation Patterns -=============================================================================== - -**Module Constants:** - -.. code-block:: python - - CHARSET_DEFAULT: str = 'utf-8' - MIMETYPE_DEFAULT: str = 'application/octet-stream' - -**Common Type Aliases:** - -.. code-block:: python - - Content: __.typx.TypeAlias = __.typx.Annotated[ - bytes, - __.ddoc.Doc( "Raw byte content for analysis." ) - ] - - Location: __.typx.TypeAlias = __.typx.Annotated[ - str | __.pathlib.Path, - __.ddoc.Doc( "File path or URL for detection context." 
) - ] - -**Absential Pattern for Context Parameters:** -- Distinguish "not provided" (absent) from "explicitly None" -- Enable three-state parameters: absent | None | value -- Support complex context handling for HTTP headers and supplements - -**Return Type Patterns:** -- Simple APIs return `str` or `__.typx.Optional[ str ]` -- Confidence APIs return structured types: `CharsetResult`, `MimetypeResult` -- Combined APIs return immutable tuples: `tuple[ MimetypeResult, CharsetResult ]` -- Default return behavior: confidence = 0.0 indicates detection failure with fallback value - -**Default Return Behavior Pattern:** -- `DetectFailureActions.Default`: Return default value with zero confidence -- `DetectFailureActions.Error`: Raise appropriate exception (legacy behavior) -- All detection functions accept `default` parameters for graceful degradation - - -Exception Hierarchy Design -=============================================================================== - -Following Omnierror Pattern -------------------------------------------------------------------------------- - -.. code-block:: python - - class Omniexception( - __.immut.Object, BaseException, - instances_visibles = ( - '__cause__', '__context__', __.is_public_identifier ), - ): - ''' Base for all exceptions raised by package API. ''' - - class Omnierror( Omniexception, Exception ): - ''' Base for error exceptions raised by package API. ''' - - # Detection-specific exceptions - class CharsetDetectFailure( Omnierror, TypeError, ValueError ): - ''' Raised when character encoding detection fails. ''' - - class CharsetInferFailure( Omnierror, TypeError, ValueError ): - ''' Raised when character encoding inference fails. ''' - - class MimetypeDetectFailure( Omnierror, TypeError, ValueError ): - ''' Raised when MIME type detection fails. ''' - - class ContentDecodeFailure( Omnierror, UnicodeError ): - ''' Raised when content cannot be decoded with detected charset. 
''' - -**Exception Design Principles:** -- Follow nomenclature patterns: `Failure` -- Inherit from appropriate built-in exception types -- Support location context in error messages -- Enable package-wide exception catching via `Omnierror` - -Implementation Considerations -=============================================================================== - -Context-Aware Detection Strategy -------------------------------------------------------------------------------- - -**Detection Priority Order:** -1. HTTP Content-Type headers (when available) -2. Location/filename extension analysis -3. Magic bytes content analysis -4. Fallback to defaults based on available information - -**Registry-Based Backend Selection:** -- Configurable detector precedence via `Behaviors` -- Dynamic fallback when detectors return `NotImplemented` -- Support for multiple optional dependencies per detection type - -**Confidence Integration:** -- Length-based confidence calculation -- Backend-specific confidence scoring -- AsNeeded behavior triggering based on confidence thresholds - -**Performance Characteristics:** -- Lazy evaluation of detection operations -- Sample-based analysis for large content -- Minimal abstraction preserving detector performance \ No newline at end of file diff --git a/documentation/architecture/designs/002-detector-registry.rst b/documentation/architecture/designs/002-detector-registry.rst deleted file mode 100644 index 4095fae..0000000 --- a/documentation/architecture/designs/002-detector-registry.rst +++ /dev/null @@ -1,242 +0,0 @@ -.. vim: set fileencoding=utf-8: -.. -*- coding: utf-8 -*- -.. +--------------------------------------------------------------------------+ - | | - | Licensed under the Apache License, Version 2.0 (the "License"); | - | you may not use this file except in compliance with the License. 
| - | You may obtain a copy of the License at | - | | - | http://www.apache.org/licenses/LICENSE-2.0 | - | | - | Unless required by applicable law or agreed to in writing, software | - | distributed under the License is distributed on an "AS IS" BASIS, | - | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | - | See the License for the specific language governing permissions and | - | limitations under the License. | - | | - +--------------------------------------------------------------------------+ - - -******************************************************************************* -002. Detector Registry Specification -******************************************************************************* - -Overview -=============================================================================== - -This document specifies the detector registry architecture for pluggable -backend support in the detextive library. The registry system enables -configurable detector precedence, graceful degradation with optional -dependencies, and dynamic fallback strategies for robust detection across -diverse environments. - -The design follows established project practices for type aliases, interface -contracts, and module organization while providing extensibility for -third-party detection backends. - -Registry Architecture -=============================================================================== - -Core Registry Types -------------------------------------------------------------------------------- - -**Detector Function Signatures** - -.. code-block:: python - - CharsetDetector: __.typx.TypeAlias = __.cabc.Callable[ - [ Content, Behaviors ], - CharsetResult | __.types.NotImplementedType - ] - - MimetypeDetector: __.typx.TypeAlias = __.cabc.Callable[ - [ Content, Behaviors ], - MimetypeResult | __.types.NotImplementedType - ] - -**Registry Container Types** - -.. 
code-block:: python - - charset_detectors: __.accret.Dictionary[ str, CharsetDetector ] - mimetype_detectors: __.accret.Dictionary[ str, MimetypeDetector ] - -**Registry Contract Specifications:** -- Detectors return specific result types with confidence scoring -- `NotImplemented` return value indicates missing optional dependency -- Registry keys provide user-configurable detector ordering -- Detector functions accept standardized parameters for consistent interfaces - -Registry Registration Pattern -------------------------------------------------------------------------------- - -**Dynamic Registration System** - -.. code-block:: python - - def _detect_via_chardet( - content: Content, behaviors: Behaviors - ) -> CharsetResult | __.types.NotImplementedType: - ''' Detects charset using chardet library. ''' - try: - from chardet import detect as _chardet_detect - except ImportError: - return NotImplemented - - # Detection implementation would follow here - - def _detect_via_charset_normalizer( - content: Content, behaviors: Behaviors - ) -> CharsetResult | __.types.NotImplementedType: - ''' Detects charset using charset-normalizer library. 
''' - try: - from charset_normalizer import from_bytes - except ImportError: - return NotImplemented - - # Detection implementation would follow here - - # Registration at module initialization - charset_detectors[ 'chardet' ] = _detect_via_chardet - charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer - -**Registration Design Principles:** -- Lazy import strategy with graceful ImportError handling -- Consistent function signature across all detector implementations -- Registry key naming matches common library names for intuitive configuration -- Module-level registration enables import-time detector discovery - -Optional Dependency Strategy -=============================================================================== - -Graceful Degradation Pattern -------------------------------------------------------------------------------- - -**NotImplemented Return Protocol** - -The registry system implements graceful degradation where: -- Detectors return `NotImplemented` for missing optional dependencies -- Registry iteration continues until successful detection -- Exception raising occurs only when all configured detectors fail -- User-configurable detector ordering enables fallback preferences - -Configuration Integration -------------------------------------------------------------------------------- - -**Behavior-Driven Detector Selection** - -.. code-block:: python - - class Behaviors( __.immut.DataclassObject ): - ''' Configuration for detector registry usage. ''' - - charset_detectors_order: __.typx.Annotated[ - __.cabc.Sequence[ str ], - __.ddoc.Doc( ''' Order in which charset detectors are applied. ''' ), - ] = ( 'chardet', 'charset-normalizer' ) - - mimetype_detectors_order: __.typx.Annotated[ - __.cabc.Sequence[ str ], - __.ddoc.Doc( ''' Order in which MIME type detectors are applied. 
''' ), - ] = ( 'magic', 'puremagic' ) - -**Configuration Design Features:** -- User-configurable detector precedence through sequence ordering -- Default ordering based on library reliability and performance characteristics -- Runtime modification support for dynamic behavior adjustment -- Validation ensures only registered detectors attempted - -Multiple Backend Support -=============================================================================== - -Charset Detection Backends -------------------------------------------------------------------------------- - -**Supported Charset Libraries** - -.. code-block:: python - - # Standard charset detection backends - charset_detectors[ 'chardet' ] # Statistical analysis, UTF-8 bias - charset_detectors[ 'charset-normalizer' ] # Enhanced heuristics, multiple algorithms - -**Backend Characteristics:** -- `chardet`: Mature statistical analysis with proven UTF-8 bias handling -- `charset-normalizer`: Enhanced detection algorithms with multiple confidence scoring - -**Registration Strategy:** -- Both libraries registered with graceful ImportError handling -- Default ordering prioritizes `chardet` for proven reliability -- User configuration enables alternative precedence based on use case requirements - -MIME Type Detection Backends -------------------------------------------------------------------------------- - -**Supported MIME Type Libraries** - -.. 
code-block:: python - - # MIME type detection backends - mimetype_detectors[ 'magic' ] # python-magic (libmagic bindings) - mimetype_detectors[ 'puremagic' ] # Pure Python magic byte detection - -**Backend Selection Strategy:** -- `python-magic`: Comprehensive magic byte database via libmagic -- `puremagic`: Pure Python implementation for deployment simplicity -- Fallback ordering ensures detection capability across diverse environments - -**Detection Priority Logic:** -- Primary detection via content analysis (magic bytes) -- Secondary detection via filename extension analysis -- Default MIME type assignment based on available context - -Interface Contract Design -=============================================================================== - -Detector Function Contracts -------------------------------------------------------------------------------- - -**Standardized Parameters** - -.. code-block:: python - - def detector_function( - content: Content, # Raw byte content for analysis - behaviors: Behaviors # Configuration object with detection preferences - ) -> DetectionResult | __.types.NotImplementedType: - ''' Standard detector function signature. 
''' - -**Return Value Specifications:** -- Successful detection returns structured result with confidence scoring -- Missing dependencies indicated by `NotImplemented` return value -- Exception raising reserved for genuine detection failures -- Result types provide consistent interface across all detection backends - -**Parameter Design Principles:** -- Wide parameter acceptance for maximum backend flexibility -- Behavior-driven configuration enables detector-specific optimization -- Content parameter accepts any bytes-like input for broad compatibility - -Result Type Integration -------------------------------------------------------------------------------- - -**Registry Return Value Contracts:** -- Successful detection returns `CharsetResult` or `MimetypeResult` (defined in API design) -- Missing dependencies indicated by `NotImplemented` return value -- Exception raising reserved for genuine detection failures -- Confidence scoring enables quality-based selection among multiple results - -Registry Architecture Summary -=============================================================================== - -**Key Design Features:** -- Pluggable backend system with standardized detector function signatures -- Graceful degradation through `NotImplemented` return protocol -- User-configurable detector precedence via `Behaviors` configuration -- Support for multiple optional dependencies per detection type - -**Implementation Architecture:** -- Registry containers in `detectors.py` module -- Type aliases for detector function signatures -- Dynamic registration with import-time discovery -- Registry-based dispatch in core detection functions \ No newline at end of file diff --git a/documentation/architecture/designs/003-default-return-behavior.rst b/documentation/architecture/designs/003-default-return-behavior.rst deleted file mode 100644 index 5bff658..0000000 --- a/documentation/architecture/designs/003-default-return-behavior.rst +++ /dev/null @@ -1,385 +0,0 @@ -.. 
vim: set fileencoding=utf-8: -.. -*- coding: utf-8 -*- -.. +--------------------------------------------------------------------------+ - | | - | Licensed under the Apache License, Version 2.0 (the "License"); | - | you may not use this file except in compliance with the License. | - | You may obtain a copy of the License at | - | | - | http://www.apache.org/licenses/LICENSE-2.0 | - | | - | Unless required by applicable law or agreed to in writing, software | - | distributed under the License is distributed on an "AS IS" BASIS, | - | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | - | See the License for the specific language governing permissions and | - | limitations under the License. | - | | - | +--------------------------------------------------------------------------+ - - -******************************************************************************* -003. Default Return Behavior Specification -******************************************************************************* - -Overview -=============================================================================== - -This document specifies configurable failure handling through default value -returns as an alternative to exception-based error handling. The design -enables graceful degradation for detection failures while maintaining -backward compatibility. - -The pattern addresses performance-critical scenarios, defensive programming -patterns, and fallback value workflows where detection failures are expected -and should not interrupt processing flows. - -Core Design Principles -=============================================================================== - -Configurable Failure Strategy -------------------------------------------------------------------------------- - -**DetectFailureActions Enum Specification** - -.. code-block:: python - - class DetectFailureActions( __.enum.Enum ): - ''' Possible responses to detection failure. 
''' - - Default = __.enum.auto( ) - Error = __.enum.auto( ) - -**Failure Action Semantics:** - -- **Default**: Return configurable default value with zero confidence -- **Error**: Raise appropriate exception (preserves backward compatibility) - -**Configuration Integration** - -The failure handling strategy integrates with the ``Behaviors`` -configuration pattern: - -.. code-block:: python - - class Behaviors( __.immut.DataclassObject ): - ''' How functions behave. ''' - - charset_on_detect_failure: __.typx.Annotated[ - DetectFailureActions, - __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ), - ] = DetectFailureActions.Default - - mimetype_on_detect_failure: __.typx.Annotated[ - DetectFailureActions, - __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ), - ] = DetectFailureActions.Default - -Default Value Management -=============================================================================== - -System-Wide Default Constants -------------------------------------------------------------------------------- - -**Module-Level Constants:** - -.. code-block:: python - - CHARSET_DEFAULT: str = 'utf-8' - MIMETYPE_DEFAULT: str = 'application/octet-stream' - -**Default Value Parameters:** - -All detection functions accept optional ``default`` parameters with appropriate -module-level constants as defaults. 
- -**Confidence Scoring for Default Returns:** - -When returning default values due to detection failure: - -- **Confidence Score**: Always ``0.0`` to indicate detection failure -- **Clear Distinction**: Enables differentiation between successful low-confidence detection and failure fallback -- **Programmatic Detection**: Applications can check ``result.confidence == 0.0`` to identify fallback scenarios - -Core Behavior Specification -=============================================================================== - -**Failure Mode Selection:** - -- **Default Mode**: Return ``default`` parameter value with zero confidence on detection failure -- **Error Mode**: Raise appropriate exception on detection failure (preserves compatibility) - -**Multi-Detection Handling:** - -- **Independent Failure Actions**: Each detection type uses its own failure action configuration -- **Separate Default Values**: ``charset_default`` and ``mimetype_default`` parameters -- **Granular Control**: Mixed failure modes supported (e.g., charset defaults, mimetype errors) - -Usage Patterns and Integration -=============================================================================== - -Performance-Critical Workflows -------------------------------------------------------------------------------- - -**Batch Processing Configuration:** - -.. 
code-block:: python - - # Configure for maximum performance with graceful degradation - performance_behaviors = Behaviors( - charset_on_detect_failure = DetectFailureActions.Default, - mimetype_on_detect_failure = DetectFailureActions.Default, - trial_decode = BehaviorTristate.Never, - text_validate = BehaviorTristate.Never, - ) - - for content_item in large_content_batch: - result = detect_charset_confidence( - content_item, - behaviors = performance_behaviors, - default = 'utf-8' # Project-specific default - ) - if result.confidence > 0.0: - # Use detected charset - charset = result.charset - else: - # Handle graceful fallback - charset = result.charset # Project default - -**Zero-Exception Processing:** - -Eliminates exception handling overhead for expected failure scenarios: - -.. code-block:: python - - def process_content_batch( contents: list[ bytes ] ) -> list[ str ]: - ''' Processes content batch without exception handling. ''' - texts = [ ] - for content in contents: - charset_result = detect_charset_confidence( content ) - if charset_result.confidence > 0.0: - # High-confidence detection - text = content.decode( charset_result.charset ) - else: - # Fallback to default encoding - text = content.decode( charset_result.charset, errors = 'replace' ) - texts.append( text ) - return texts - -Defensive Programming Patterns -------------------------------------------------------------------------------- - -**Robust Content Processing:** - -.. code-block:: python - - def safe_text_extraction( content: bytes ) -> str: - ''' Extracts text with multiple fallback layers. 
''' - charset_result = detect_charset_confidence( content ) - - # Layer 1: High-confidence detection - if charset_result.confidence > 0.8: - try: return content.decode( charset_result.charset ) - except UnicodeDecodeError: pass - - # Layer 2: Medium-confidence with error handling - if charset_result.confidence > 0.3: - try: return content.decode( charset_result.charset, errors = 'replace' ) - except UnicodeDecodeError: pass - - # Layer 3: Fallback to system default - return content.decode( charset_result.charset, errors = 'ignore' ) - -**Mixed Error Handling:** - -.. code-block:: python - - # Strict validation for charset, graceful for MIME type - mixed_behaviors = Behaviors( - charset_on_detect_failure = DetectFailureActions.Error, - mimetype_on_detect_failure = DetectFailureActions.Default, - ) - -Security-Conscious Integration -------------------------------------------------------------------------------- - -**Validation-First Configuration:** - -.. code-block:: python - - # Security-focused configuration with exception-based error handling - security_behaviors = Behaviors( - charset_on_detect_failure = DetectFailureActions.Error, - mimetype_on_detect_failure = DetectFailureActions.Error, - trial_decode = BehaviorTristate.Always, - text_validate = BehaviorTristate.Always, - ) - - try: - result = detect_charset_confidence( - untrusted_content, - behaviors = security_behaviors - ) - # Proceed only with successful detection - validated_text = process_with_charset( result.charset ) - except CharsetDetectFailure: - # Handle detection failure as security concern - reject_untrusted_content( ) - -Implementation Integration Points -=============================================================================== - -Detector Registry Integration -------------------------------------------------------------------------------- - -**Registry Failure Handling:** - -The default return behavior integrates with the detector registry architecture: - -.. 
code-block:: python - - # Registry iteration with failure handling - for detector_name in behaviors.charset_detectors_order: - detector = charset_detectors.get( detector_name ) - if detector is None: continue - result = detector( content, behaviors ) - if result is NotImplemented: continue - return result - - # No detectors succeeded - apply failure action - match behaviors.charset_on_detect_failure: - case DetectFailureActions.Default: - return CharsetResult( charset = default, confidence = 0.0 ) - case DetectFailureActions.Error: - raise CharsetDetectFailure( location = location ) - -**Optional Dependency Graceful Degradation:** - -When preferred detectors are unavailable, the system gracefully falls back: - -.. code-block:: python - - def _detect_via_chardet( content: Content, behaviors: Behaviors ) -> CharsetResult | NotImplementedType: - try: import chardet - except ImportError: return NotImplemented - # ... detection logic - - # Registry automatically handles NotImplemented returns - # Falls back to next detector or applies failure action - -Confidence-Based Decision Making -------------------------------------------------------------------------------- - -**Confidence Threshold Integration:** - -Default return behavior works with existing confidence-based logic: - -.. 
code-block:: python - - # AsNeeded behavior respects confidence scoring - charset_result = detect_charset_confidence( content ) - - if charset_result.confidence >= behaviors.trial_decode_confidence: - # Skip expensive trial decode for high-confidence results - return charset_result - elif charset_result.confidence == 0.0: - # Handle failure case explicitly - return fallback_charset_detection( content ) - else: - # Perform trial decode for medium-confidence results - return trial_decode_validation( content, charset_result ) - -Backward Compatibility Guarantees -=============================================================================== - -API Compatibility -------------------------------------------------------------------------------- - -**Signature Preservation:** - -- All existing function signatures remain valid -- New ``default`` parameters have appropriate defaults -- Existing code continues working without modification - -**Behavioral Preservation:** - -- Default configuration preserves exception-based error handling for simple functions -- Confidence functions default to graceful degradation pattern -- No breaking changes to existing exception types or messages - -**Migration Path:** - -.. 
code-block:: python - - # v1.x/v2.0 existing code (continues working) - try: - charset = detect_charset( content ) - except CharsetDetectFailure: - charset = 'utf-8' # Manual fallback - - # Enhanced v2.x approach (optional migration) - behaviors = Behaviors( charset_on_detect_failure = DetectFailureActions.Default ) - charset = detect_charset( content, behaviors = behaviors, default = 'utf-8' ) - # No exception handling needed - -Configuration Evolution -------------------------------------------------------------------------------- - -**Behaviors Dataclass Compatibility:** - -- New fields added with backward-compatible defaults -- Existing ``Behaviors`` instances continue working -- Incremental adoption of new failure handling features - -**Exception Hierarchy Preservation:** - -- All existing exception classes maintained -- Exception chaining and context preservation unchanged -- Error messages and exception attributes consistent - -Type Safety and Documentation -=============================================================================== - -Type Annotation Patterns -------------------------------------------------------------------------------- - -**Confidence Score Interpretation:** - -.. code-block:: python - - def interpret_charset_result( result: CharsetResult ) -> str: - ''' Interprets charset result with confidence awareness. ''' - if result.confidence == 0.0: - # Detection failed - using fallback value - logger.warning( f"Charset detection failed, using fallback: {result.charset}" ) - elif result.confidence < 0.5: - # Low confidence detection - logger.info( f"Low-confidence charset detection: {result.charset}" ) - # Normal high-confidence processing - return result.charset - -**Default Parameter Type Safety:** - -All ``default`` parameters are properly typed as ``str`` with appropriate -module-level constants as defaults, ensuring type safety and consistency. 
- -Documentation Patterns -------------------------------------------------------------------------------- - -**Function Documentation Standards:** - -All function docstrings include failure behavior documentation: - -.. code-block:: python - - def detect_charset_confidence( ... ) -> CharsetResult: - ''' Detects character encoding with confidence scoring. - - When configured for default return behavior, returns default - value with zero confidence on detection failure rather than - raising CharsetDetectFailure. Confidence of 0.0 indicates - detection failure with fallback value. - ''' - -**Configuration Documentation:** - -``Behaviors`` fields include comprehensive documentation of failure handling semantics and integration with other configuration options. \ No newline at end of file diff --git a/documentation/architecture/designs/004-trial-codecs-usage-patterns.rst b/documentation/architecture/designs/004-trial-codecs-usage-patterns.rst deleted file mode 100644 index ca8be9e..0000000 --- a/documentation/architecture/designs/004-trial-codecs-usage-patterns.rst +++ /dev/null @@ -1,89 +0,0 @@ -.. vim: set fileencoding=utf-8: -.. -*- coding: utf-8 -*- - -******************************************************************************* -Trial Codecs Usage Patterns -******************************************************************************* - -Context -=============================================================================== - -The ``trial_codecs`` behavior parameter controls which character sets are tried -during decoding operations. Analysis revealed three distinct usage patterns -with different requirements, leading to platform-specific failures when the -same codec order was used for all contexts. 
- -Usage Patterns -=============================================================================== - -Opportunistic Decoding -------------------------------------------------------------------------------- - -**Goal**: Find any charset that produces readable text from content. - -**Context**: The ``decode()`` function and general content decoding. - -**Strategy**: Try multiple codecs including OS default until one succeeds. - -**Codecs**: ``(OsDefault, UserSupplement, FromInference)`` - -**Rationale**: On modern systems (Linux/Mac), OsDefault is UTF-8, providing a -good first guess that corrects common chardet misdetections. - -Authoritative Validation -------------------------------------------------------------------------------- - -**Goal**: Verify that a specific authoritative charset works (no fallbacks). - -**Context**: HTTP ``Content-Type`` headers, MIME type charset validation. - -**Strategy**: Only try the explicitly specified charset. - -**Codecs**: ``(FromInference,)`` - -**Rationale**: When a charset is authoritatively specified (e.g., HTTP header), -we must test that exact charset, not find alternatives. OS default fallbacks -would mask validation failures. - -Detection Confirmation -------------------------------------------------------------------------------- - -**Goal**: Validate detected charset with optional user hint as fallback. - -**Context**: Charset detection confirmation in ``_confirm_charset_detection()``. - -**Strategy**: Try detected charset, then user supplement if detection fails. - -**Codecs**: ``(UserSupplement, FromInference)`` - -**Rationale**: Validates the detection result but respects user knowledge as -a fallback. Excludes OS default to prevent Windows cp1252 from masking -detection failures. - -Implementation -=============================================================================== - -Each context overrides ``trial_codecs`` via ``__.dcls.replace()`` before -calling codec trial functions: - -.. 
code-block:: python - - # Authoritative validation - behaviors_strict = __.dcls.replace( - behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ) - - # Detection confirmation - behaviors_no_os = __.dcls.replace( - behaviors, - trial_codecs = ( _CodecSpecifiers.UserSupplement, - _CodecSpecifiers.FromInference ) ) - -Platform Considerations -=============================================================================== - -**Windows Issue**: OS default charset is cp1252, an 8-bit encoding that -decodes any byte sequence. When used in validation contexts, it masks -detection failures by succeeding when it shouldn't. - -**Solution**: Exclude ``OsDefault`` from validation and confirmation contexts, -using it only for opportunistic decoding where fallbacks are desired. diff --git a/documentation/architecture/designs/index.rst b/documentation/architecture/designs/index.rst index b48bc53..fcb5dfa 100644 --- a/documentation/architecture/designs/index.rst +++ b/documentation/architecture/designs/index.rst @@ -28,8 +28,4 @@ Each design documents Python-specific architecture, interface contracts, module :maxdepth: 2 :glob: - 001-python-api - 002-detector-registry - 003-default-return-behavior - 004-trial-codecs-usage-patterns ../openspec/specs/*/design diff --git a/documentation/architecture/openspec/specs/api/design.md b/documentation/architecture/openspec/specs/api/design.md new file mode 100644 index 0000000..1e9dae5 --- /dev/null +++ b/documentation/architecture/openspec/specs/api/design.md @@ -0,0 +1,988 @@ +# API Design + +## 001. Python API Specification + +### Overview + +This document specifies the Python API implementing context-aware +text detection with pluggable backend support, confidence-based detection, +and optional dependency architecture. 
+ +The design follows established project practices for interface contracts, +module organization, naming conventions, and provides both simple string-based +APIs and confidence-aware APIs with structured result types. + +### Public Interface Specification + +#### Core Type Definitions + +**Confidence-Based Result Types** + +```python +class CharsetResult( __.immut.DataclassObject ): + ''' Character set encoding with detection confidence. ''' + + charset: __.typx.Annotated[ + __.typx.Optional[ str ], + __.ddoc.Doc( ''' Detected character set encoding. May be None. ''' ), + ] + confidence: __.typx.Annotated[ + float, __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ) + ] + +class MimetypeResult( __.immut.DataclassObject ): + ''' MIME type with detection confidence. ''' + + mimetype: __.typx.Annotated[ + str, __.ddoc.Doc( ''' Detected MIME type. ''' ) + ] + confidence: __.typx.Annotated[ + float, __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ) + ] +``` + +**Configuration Types** + +```python +class BehaviorTristate( __.enum.Enum ): + ''' When to apply behavior. ''' + + Never = __.enum.auto( ) + AsNeeded = __.enum.auto( ) + Always = __.enum.auto( ) + +class DetectFailureActions( __.enum.Enum ): + ''' Possible responses to detection failure. ''' + + Default = __.enum.auto( ) + Error = __.enum.auto( ) + +class CodecSpecifiers( __.enum.Enum ): + ''' Specifiers for dynamic codecs. ''' + + FromInference = __.enum.auto( ) + OsDefault = __.enum.auto( ) + PythonDefault = __.enum.auto( ) + UserSupplement = __.enum.auto( ) + +class Behaviors( __.immut.DataclassObject ): + ''' How functions behave. ''' + + charset_detectors_order: __.typx.Annotated[ + __.cabc.Sequence[ str ], + __.ddoc.Doc( ''' Order in which charset detectors are applied. ''' ), + ] = ( 'chardet', 'charset-normalizer' ) + + charset_on_detect_failure: __.typx.Annotated[ + DetectFailureActions, + __.ddoc.Doc( ''' Action to take on charset detection failure. 
''' ), + ] = DetectFailureActions.Default + + mimetype_detectors_order: __.typx.Annotated[ + __.cabc.Sequence[ str ], + __.ddoc.Doc( ''' Order in which MIME type detectors are applied. ''' ), + ] = ( 'magic', 'puremagic' ) + + mimetype_on_detect_failure: __.typx.Annotated[ + DetectFailureActions, + __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ), + ] = DetectFailureActions.Default + + charset_detect: __.typx.Annotated[ + BehaviorTristate, + __.ddoc.Doc( ''' When to detect charset from content. ''' ), + ] = BehaviorTristate.AsNeeded + + mimetype_detect: __.typx.Annotated[ + BehaviorTristate, + __.ddoc.Doc( ''' When to detect MIME type from content. ''' ), + ] = BehaviorTristate.AsNeeded +``` + +#### Simple String-Based Detection Functions + +**Character Encoding Detection** + +```python +def detect_charset( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + default: str = CHARSET_DEFAULT, + supplement: __.Absential[ str ] = __.absent, + mimetype: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> __.typx.Optional[ str ]: + ''' Detects character encoding. + + Returns the most likely character encoding. When configured for + default return behavior, returns the default value on detection + failure rather than raising an exception. + ''' + +def detect_mimetype( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + default: str = MIMETYPE_DEFAULT, + charset: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> str: + ''' Detects MIME type. + + Returns the most likely MIME type. When configured for default + return behavior, returns the default value on detection failure + rather than raising an exception. 
+ ''' +``` + +**Inference Functions with Context Support** + +```python +def infer_charset( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + charset_default: str = CHARSET_DEFAULT, + http_content_type: __.Absential[ str ] = __.absent, + charset_supplement: __.Absential[ str ] = __.absent, + mimetype_supplement: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> __.typx.Optional[ str ]: + ''' Infers charset through various means. + + Utilizes HTTP Content-Type headers, location hints, and content + analysis for contextual charset inference. Supports configurable + default return behavior on inference failure. + ''' + +def infer_mimetype_charset( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + charset_default: str = CHARSET_DEFAULT, + mimetype_default: str = MIMETYPE_DEFAULT, + http_content_type: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, + charset_supplement: __.Absential[ str ] = __.absent, + mimetype_supplement: __.Absential[ str ] = __.absent, +) -> tuple[ str, __.typx.Optional[ str ] ]: + ''' Detects MIME type and charset with context support. + + Returns tuple of (mimetype, charset). Provides comprehensive + detection utilizing all available context with configurable + default behavior on detection failure. + ''' +``` + +#### Confidence-Based Detection Functions + +**Core Confidence Functions** + +```python +def detect_charset_confidence( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + default: str = CHARSET_DEFAULT, + supplement: __.Absential[ str ] = __.absent, + mimetype: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> CharsetResult: + ''' Detects character encoding with confidence scoring. + + Returns CharsetResult with charset and confidence level. When + configured for default return behavior, returns default value + with zero confidence on detection failure. 
+ ''' + +def detect_mimetype_confidence( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + default: str = MIMETYPE_DEFAULT, + charset: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> MimetypeResult: + ''' Detects MIME type with confidence scoring. + + Returns MimetypeResult with mimetype and confidence level. When + configured for default return behavior, returns default value + with zero confidence on detection failure. + ''' +``` + +**Advanced Confidence Inference** + +```python +def infer_charset_confidence( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + charset_default: str = CHARSET_DEFAULT, + http_content_type: __.Absential[ str ] = __.absent, + charset_supplement: __.Absential[ str ] = __.absent, + mimetype_supplement: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, +) -> CharsetResult: + ''' Infers charset with confidence through various means. + + Utilizes contextual information for enhanced detection quality. + Supports configurable default return behavior on inference failure. + ''' + +def infer_mimetype_charset_confidence( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + charset_default: str = CHARSET_DEFAULT, + mimetype_default: str = MIMETYPE_DEFAULT, + http_content_type: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, + charset_supplement: __.Absential[ str ] = __.absent, + mimetype_supplement: __.Absential[ str ] = __.absent, +) -> tuple[ MimetypeResult, CharsetResult ]: + ''' Detects MIME type and charset with confidence scoring. + + Returns tuple of (MimetypeResult, CharsetResult) with full + confidence information for both detection results. Supports + configurable default behavior on detection failure. 
+ ''' +``` + +**Confidence Utility Functions** + +```python +def confidence_from_bytes_quantity( + content: Content, + behaviors: Behaviors = BEHAVIORS_DEFAULT +) -> float: + ''' Calculates confidence score based on content length. + + Returns confidence value from 0.0 to 1.0 based on the amount + of content available for analysis. + ''' +``` + +#### High-Level Decoding and Validation + +**Content Decoding** + +```python +def decode( + content: Content, /, *, + behaviors: Behaviors = BEHAVIORS_DEFAULT, + profile: TextValidationProfile = PROFILE_TEXTUAL, + charset_default: str = CHARSET_DEFAULT, + mimetype_default: str = MIMETYPE_DEFAULT, + http_content_type: __.Absential[ str ] = __.absent, + location: __.Absential[ Location ] = __.absent, + charset_supplement: __.Absential[ str ] = __.absent, + mimetype_supplement: __.Absential[ str ] = __.absent, +) -> str: + ''' High-level bytes-to-text decoding with validation. + + Performs comprehensive detection, decoding, and validation + for robust text extraction from byte content. Supports + configurable default values for graceful degradation. + ''' +``` + +**Textual Content Validation** + +```python +def is_textual_mimetype( mimetype: str ) -> bool: + ''' Validates if MIME type represents textual content. + + Returns True for MIME types representing textual content. + ''' + +def is_valid_text( + text: str, + profile: TextValidationProfile = PROFILE_TEXTUAL +) -> bool: + ''' Unicode-aware text validation with configurable profiles. + + Returns True for text meeting the specified validation profile. + ''' +``` + +#### Line Separator Processing + +**LineSeparators Enum** (unchanged from v1.x specification) + +```python +class LineSeparators( __.enum.Enum ): + ''' Line separators for cross-platform text processing. 
''' + + CR = '\r' # Classic MacOS (0xD) + CRLF = '\r\n' # DOS/Windows (0xD 0xA) + LF = '\n' # Unix/Linux (0xA) + + @classmethod + def detect_bytes( + selfclass, + content: __.cabc.Sequence[ int ] | bytes, + limit: int = 1024 + ) -> __.typx.Optional[ 'LineSeparators' ]: + ''' Detects line separator from byte content sample. ''' + + @classmethod + def normalize_universal( selfclass, content: str ) -> str: + ''' Normalizes all line separators to Unix LF format. ''' + + def normalize( self, content: str ) -> str: + ''' Normalizes specific line separator to Unix LF format. ''' + + def nativize( self, content: str ) -> str: + ''' Converts Unix LF to this platform's line separator. ''' +``` + +### Type Annotation Patterns + +**Module Constants:** + +```python +CHARSET_DEFAULT: str = 'utf-8' +MIMETYPE_DEFAULT: str = 'application/octet-stream' +``` + +**Common Type Aliases:** + +```python +Content: __.typx.TypeAlias = __.typx.Annotated[ + bytes, + __.ddoc.Doc( "Raw byte content for analysis." ) +] + +Location: __.typx.TypeAlias = __.typx.Annotated[ + str | __.pathlib.Path, + __.ddoc.Doc( "File path or URL for detection context." 
+**Absential Pattern for Context Parameters:**
+- Distinguish "not provided" (absent) from "explicitly None"
+- Enable three-state parameters: absent | None | value
+- Support complex context handling for HTTP headers and supplements
+
+**Return Type Patterns:**
+- Simple APIs return `str` or `__.typx.Optional[ str ]`
+- Confidence APIs return structured types: `CharsetResult`, `MimetypeResult`
+- Combined APIs return immutable tuples: `tuple[ MimetypeResult, CharsetResult ]`
+- Default return behavior: confidence = 0.0 indicates detection failure with fallback value
+
+**Default Return Behavior Pattern:**
+- `DetectFailureActions.Default`: Return default value with zero confidence
+- `DetectFailureActions.Error`: Raise appropriate exception (legacy behavior)
+- All detection functions accept `default` parameters for graceful degradation
+**Exception Design Principles:**
+- Follow nomenclature patterns: exception names end in `Failure`
+- Inherit from appropriate built-in exception types
+- Support location context in error messages
+- Enable package-wide exception catching via `Omnierror`
+
+### Implementation Considerations
+
+#### Context-Aware Detection Strategy
+
+**Detection Priority Order:**
+1. HTTP Content-Type headers (when available)
+2. Location/filename extension analysis
+3. Magic bytes content analysis
+4. Fallback to defaults based on available information
+
+**Registry-Based Backend Selection:**
+- Configurable detector precedence via `Behaviors`
+- Dynamic fallback when detectors return `NotImplemented`
+- Support for multiple optional dependencies per detection type
+
+**Confidence Integration:**
+- Length-based confidence calculation
+- Backend-specific confidence scoring
+- AsNeeded behavior triggering based on confidence thresholds
+
+**Performance Characteristics:**
+- Lazy evaluation of detection operations
+- Sample-based analysis for large content
+- Minimal abstraction preserving detector performance
+ +### Registry Architecture + +#### Core Registry Types + +**Detector Function Signatures** + +```python +CharsetDetector: __.typx.TypeAlias = __.cabc.Callable[ + [ Content, Behaviors ], + CharsetResult | __.types.NotImplementedType +] + +MimetypeDetector: __.typx.TypeAlias = __.cabc.Callable[ + [ Content, Behaviors ], + MimetypeResult | __.types.NotImplementedType +] +``` + +**Registry Container Types** + +```python +charset_detectors: __.accret.Dictionary[ str, CharsetDetector ] +mimetype_detectors: __.accret.Dictionary[ str, MimetypeDetector ] +``` + +**Registry Contract Specifications:** +\- Detectors return specific result types with confidence scoring +\- `NotImplemented` return value indicates missing optional dependency +\- Registry keys provide user-configurable detector ordering +\- Detector functions accept standardized parameters for consistent interfaces + +#### Registry Registration Pattern + +**Dynamic Registration System** + +```python +def _detect_via_chardet( + content: Content, behaviors: Behaviors +) -> CharsetResult | __.types.NotImplementedType: + ''' Detects charset using chardet library. ''' + try: + from chardet import detect as _chardet_detect + except ImportError: + return NotImplemented + + # Detection implementation would follow here + +def _detect_via_charset_normalizer( + content: Content, behaviors: Behaviors +) -> CharsetResult | __.types.NotImplementedType: + ''' Detects charset using charset-normalizer library. 
''' + try: + from charset_normalizer import from_bytes + except ImportError: + return NotImplemented + + # Detection implementation would follow here + +# Registration at module initialization +charset_detectors[ 'chardet' ] = _detect_via_chardet +charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer +``` + +**Registration Design Principles:** +\- Lazy import strategy with graceful ImportError handling +\- Consistent function signature across all detector implementations +\- Registry key naming matches common library names for intuitive configuration +\- Module-level registration enables import-time detector discovery + +### Optional Dependency Strategy + +#### Graceful Degradation Pattern + +**NotImplemented Return Protocol** + +The registry system implements graceful degradation where: +\- Detectors return `NotImplemented` for missing optional dependencies +\- Registry iteration continues until successful detection +\- Exception raising occurs only when all configured detectors fail +\- User-configurable detector ordering enables fallback preferences + +#### Configuration Integration + +**Behavior-Driven Detector Selection** + +```python +class Behaviors( __.immut.DataclassObject ): + ''' Configuration for detector registry usage. ''' + + charset_detectors_order: __.typx.Annotated[ + __.cabc.Sequence[ str ], + __.ddoc.Doc( ''' Order in which charset detectors are applied. ''' ), + ] = ( 'chardet', 'charset-normalizer' ) + + mimetype_detectors_order: __.typx.Annotated[ + __.cabc.Sequence[ str ], + __.ddoc.Doc( ''' Order in which MIME type detectors are applied. 
''' ), + ] = ( 'magic', 'puremagic' ) +``` + +**Configuration Design Features:** +\- User-configurable detector precedence through sequence ordering +\- Default ordering based on library reliability and performance characteristics +\- Runtime modification support for dynamic behavior adjustment +\- Validation ensures only registered detectors attempted + +### Multiple Backend Support + +#### Charset Detection Backends + +**Supported Charset Libraries** + +```python +# Standard charset detection backends +charset_detectors[ 'chardet' ] # Statistical analysis, UTF-8 bias +charset_detectors[ 'charset-normalizer' ] # Enhanced heuristics, multiple algorithms +``` + +**Backend Characteristics:** +\- `chardet`: Mature statistical analysis with proven UTF-8 bias handling +\- `charset-normalizer`: Enhanced detection algorithms with multiple confidence scoring + +**Registration Strategy:** +\- Both libraries registered with graceful ImportError handling +\- Default ordering prioritizes `chardet` for proven reliability +\- User configuration enables alternative precedence based on use case requirements + +#### MIME Type Detection Backends + +**Supported MIME Type Libraries** + +```python +# MIME type detection backends +mimetype_detectors[ 'magic' ] # python-magic (libmagic bindings) +mimetype_detectors[ 'puremagic' ] # Pure Python magic byte detection +``` + +**Backend Selection Strategy:** +\- `python-magic`: Comprehensive magic byte database via libmagic +\- `puremagic`: Pure Python implementation for deployment simplicity +\- Fallback ordering ensures detection capability across diverse environments + +**Detection Priority Logic:** +\- Primary detection via content analysis (magic bytes) +\- Secondary detection via filename extension analysis +\- Default MIME type assignment based on available context + +### Interface Contract Design + +#### Detector Function Contracts + +**Standardized Parameters** + +```python +def detector_function( + content: Content, # Raw byte 
content for analysis + behaviors: Behaviors # Configuration object with detection preferences +) -> DetectionResult | __.types.NotImplementedType: + ''' Standard detector function signature. ''' +``` + +**Return Value Specifications:** +\- Successful detection returns structured result with confidence scoring +\- Missing dependencies indicated by `NotImplemented` return value +\- Exception raising reserved for genuine detection failures +\- Result types provide consistent interface across all detection backends + +**Parameter Design Principles:** +\- Wide parameter acceptance for maximum backend flexibility +\- Behavior-driven configuration enables detector-specific optimization +\- Content parameter accepts any bytes-like input for broad compatibility + +#### Result Type Integration + +**Registry Return Value Contracts:** +\- Successful detection returns `CharsetResult` or `MimetypeResult` (defined in API design) +\- Missing dependencies indicated by `NotImplemented` return value +\- Exception raising reserved for genuine detection failures +\- Confidence scoring enables quality-based selection among multiple results + +### Registry Architecture Summary + +**Key Design Features:** +\- Pluggable backend system with standardized detector function signatures +\- Graceful degradation through `NotImplemented` return protocol +\- User-configurable detector precedence via `Behaviors` configuration +\- Support for multiple optional dependencies per detection type + +**Implementation Architecture:** +\- Registry containers in `detectors.py` module +\- Type aliases for detector function signatures +\- Dynamic registration with import-time discovery +\- Registry-based dispatch in core detection functions + + + +## 003. Default Return Behavior Specification + +### Overview + +This document specifies configurable failure handling through default value +returns as an alternative to exception-based error handling. 
The design +enables graceful degradation for detection failures while maintaining +backward compatibility. + +The pattern addresses performance-critical scenarios, defensive programming +patterns, and fallback value workflows where detection failures are expected +and should not interrupt processing flows. + +### Core Design Principles + +#### Configurable Failure Strategy + +**DetectFailureActions Enum Specification** + +```python +class DetectFailureActions( __.enum.Enum ): + ''' Possible responses to detection failure. ''' + + Default = __.enum.auto( ) + Error = __.enum.auto( ) +``` + +**Failure Action Semantics:** + +- **Default**: Return configurable default value with zero confidence +- **Error**: Raise appropriate exception (preserves backward compatibility) + +**Configuration Integration** + +The failure handling strategy integrates with the `Behaviors` +configuration pattern: + +```python +class Behaviors( __.immut.DataclassObject ): + ''' How functions behave. ''' + + charset_on_detect_failure: __.typx.Annotated[ + DetectFailureActions, + __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ), + ] = DetectFailureActions.Default + + mimetype_on_detect_failure: __.typx.Annotated[ + DetectFailureActions, + __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ), + ] = DetectFailureActions.Default +``` + +### Default Value Management + +#### System-Wide Default Constants + +**Module-Level Constants:** + +```python +CHARSET_DEFAULT: str = 'utf-8' +MIMETYPE_DEFAULT: str = 'application/octet-stream' +``` + +**Default Value Parameters:** + +All detection functions accept optional `default` parameters with appropriate +module-level constants as defaults. 
+ +**Confidence Scoring for Default Returns:** + +When returning default values due to detection failure: + +- **Confidence Score**: Always `0.0` to indicate detection failure +- **Clear Distinction**: Enables differentiation between successful low-confidence detection and failure fallback +- **Programmatic Detection**: Applications can check `result.confidence == 0.0` to identify fallback scenarios + +### Core Behavior Specification + +**Failure Mode Selection:** + +- **Default Mode**: Return `default` parameter value with zero confidence on detection failure +- **Error Mode**: Raise appropriate exception on detection failure (preserves compatibility) + +**Multi-Detection Handling:** + +- **Independent Failure Actions**: Each detection type uses its own failure action configuration +- **Separate Default Values**: `charset_default` and `mimetype_default` parameters +- **Granular Control**: Mixed failure modes supported (e.g., charset defaults, mimetype errors) + +### Usage Patterns and Integration + +#### Performance-Critical Workflows + +**Batch Processing Configuration:** + +```python +# Configure for maximum performance with graceful degradation +performance_behaviors = Behaviors( + charset_on_detect_failure = DetectFailureActions.Default, + mimetype_on_detect_failure = DetectFailureActions.Default, + trial_decode = BehaviorTristate.Never, + text_validate = BehaviorTristate.Never, +) + +for content_item in large_content_batch: + result = detect_charset_confidence( + content_item, + behaviors = performance_behaviors, + default = 'utf-8' # Project-specific default + ) + if result.confidence > 0.0: + # Use detected charset + charset = result.charset + else: + # Handle graceful fallback + charset = result.charset # Project default +``` + +**Zero-Exception Processing:** + +Eliminates exception handling overhead for expected failure scenarios: + +```python +def process_content_batch( contents: list[ bytes ] ) -> list[ str ]: + ''' Processes content batch without 
exception handling. ''' + texts = [ ] + for content in contents: + charset_result = detect_charset_confidence( content ) + if charset_result.confidence > 0.0: + # High-confidence detection + text = content.decode( charset_result.charset ) + else: + # Fallback to default encoding + text = content.decode( charset_result.charset, errors = 'replace' ) + texts.append( text ) + return texts +``` + +#### Defensive Programming Patterns + +**Robust Content Processing:** + +```python +def safe_text_extraction( content: bytes ) -> str: + ''' Extracts text with multiple fallback layers. ''' + charset_result = detect_charset_confidence( content ) + + # Layer 1: High-confidence detection + if charset_result.confidence > 0.8: + try: return content.decode( charset_result.charset ) + except UnicodeDecodeError: pass + + # Layer 2: Medium-confidence with error handling + if charset_result.confidence > 0.3: + try: return content.decode( charset_result.charset, errors = 'replace' ) + except UnicodeDecodeError: pass + + # Layer 3: Fallback to system default + return content.decode( charset_result.charset, errors = 'ignore' ) +``` + +**Mixed Error Handling:** + +```python +# Strict validation for charset, graceful for MIME type +mixed_behaviors = Behaviors( + charset_on_detect_failure = DetectFailureActions.Error, + mimetype_on_detect_failure = DetectFailureActions.Default, +) +``` + +#### Security-Conscious Integration + +**Validation-First Configuration:** + +```python +# Security-focused configuration with exception-based error handling +security_behaviors = Behaviors( + charset_on_detect_failure = DetectFailureActions.Error, + mimetype_on_detect_failure = DetectFailureActions.Error, + trial_decode = BehaviorTristate.Always, + text_validate = BehaviorTristate.Always, +) + +try: + result = detect_charset_confidence( + untrusted_content, + behaviors = security_behaviors + ) + # Proceed only with successful detection + validated_text = process_with_charset( result.charset ) +except 
CharsetDetectFailure: + # Handle detection failure as security concern + reject_untrusted_content( ) +``` + +### Implementation Integration Points + +#### Detector Registry Integration + +**Registry Failure Handling:** + +The default return behavior integrates with the detector registry architecture: + +```python +# Registry iteration with failure handling +for detector_name in behaviors.charset_detectors_order: + detector = charset_detectors.get( detector_name ) + if detector is None: continue + result = detector( content, behaviors ) + if result is NotImplemented: continue + return result + +# No detectors succeeded - apply failure action +match behaviors.charset_on_detect_failure: + case DetectFailureActions.Default: + return CharsetResult( charset = default, confidence = 0.0 ) + case DetectFailureActions.Error: + raise CharsetDetectFailure( location = location ) +``` + +**Optional Dependency Graceful Degradation:** + +When preferred detectors are unavailable, the system gracefully falls back: + +```python +def _detect_via_chardet( content: Content, behaviors: Behaviors ) -> CharsetResult | NotImplementedType: + try: import chardet + except ImportError: return NotImplemented + # ... 
detection logic + +# Registry automatically handles NotImplemented returns +# Falls back to next detector or applies failure action +``` + +#### Confidence-Based Decision Making + +**Confidence Threshold Integration:** + +Default return behavior works with existing confidence-based logic: + +```python +# AsNeeded behavior respects confidence scoring +charset_result = detect_charset_confidence( content ) + +if charset_result.confidence >= behaviors.trial_decode_confidence: + # Skip expensive trial decode for high-confidence results + return charset_result +elif charset_result.confidence == 0.0: + # Handle failure case explicitly + return fallback_charset_detection( content ) +else: + # Perform trial decode for medium-confidence results + return trial_decode_validation( content, charset_result ) +``` + +### Backward Compatibility Guarantees + +#### API Compatibility + +**Signature Preservation:** + +- All existing function signatures remain valid +- New `default` parameters have appropriate defaults +- Existing code continues working without modification + +**Behavioral Preservation:** + +- Default configuration preserves exception-based error handling for simple functions +- Confidence functions default to graceful degradation pattern +- No breaking changes to existing exception types or messages + +**Migration Path:** + +```python +# v1.x/v2.0 existing code (continues working) +try: + charset = detect_charset( content ) +except CharsetDetectFailure: + charset = 'utf-8' # Manual fallback + +# Enhanced v2.x approach (optional migration) +behaviors = Behaviors( charset_on_detect_failure = DetectFailureActions.Default ) +charset = detect_charset( content, behaviors = behaviors, default = 'utf-8' ) +# No exception handling needed +``` + +#### Configuration Evolution + +**Behaviors Dataclass Compatibility:** + +- New fields added with backward-compatible defaults +- Existing `Behaviors` instances continue working +- Incremental adoption of new failure handling features + 
+**Exception Hierarchy Preservation:** + +- All existing exception classes maintained +- Exception chaining and context preservation unchanged +- Error messages and exception attributes consistent + +### Type Safety and Documentation + +#### Type Annotation Patterns + +**Confidence Score Interpretation:** + +```python +def interpret_charset_result( result: CharsetResult ) -> str: + ''' Interprets charset result with confidence awareness. ''' + if result.confidence == 0.0: + # Detection failed - using fallback value + logger.warning( f"Charset detection failed, using fallback: {result.charset}" ) + elif result.confidence < 0.5: + # Low confidence detection + logger.info( f"Low-confidence charset detection: {result.charset}" ) + # Normal high-confidence processing + return result.charset +``` + +**Default Parameter Type Safety:** + +All `default` parameters are properly typed as `str` with appropriate +module-level constants as defaults, ensuring type safety and consistency. + +#### Documentation Patterns + +**Function Documentation Standards:** + +All function docstrings include failure behavior documentation: + +```python +def detect_charset_confidence( ... ) -> CharsetResult: + ''' Detects character encoding with confidence scoring. + + When configured for default return behavior, returns default + value with zero confidence on detection failure rather than + raising CharsetDetectFailure. Confidence of 0.0 indicates + detection failure with fallback value. + ''' +``` + +**Configuration Documentation:** + +`Behaviors` fields include comprehensive documentation of failure handling semantics and integration with other configuration options. 
diff --git a/documentation/architecture/openspec/specs/api/spec.md b/documentation/architecture/openspec/specs/api/spec.md new file mode 100644 index 0000000..31b0684 --- /dev/null +++ b/documentation/architecture/openspec/specs/api/spec.md @@ -0,0 +1,33 @@ +# API + +## Purpose +The API capability provides a consistent and configurable interface for accessing detection and validation functionalities. It ensures standardized error handling, return types, and extensibility through a detector registry. + +## Requirements + +### Requirement: Unified Interface +The system SHALL provide a unified interface for detection functions (charset, mimetype) using common behavior configuration objects. + +Priority: High + +#### Scenario: Use common configuration +- **WHEN** calling detection functions +- **THEN** they accept a common behavior object + +### Requirement: Configurable Behaviors +The system SHALL allow users to configure behaviors such as failure handling (error vs default value) and validation strictness. + +Priority: High + +#### Scenario: Configure failure handling +- **WHEN** behavior is configured to return default on failure +- **THEN** no exception is raised when detection fails + +### Requirement: Extensibility +The system SHALL support adding new detectors via a registry mechanism without modifying core code. + +Priority: Medium + +#### Scenario: Register new detector +- **WHEN** a new detector is registered +- **THEN** it is used in subsequent detection calls diff --git a/documentation/architecture/openspec/specs/charset-detection/design.md b/documentation/architecture/openspec/specs/charset-detection/design.md new file mode 100644 index 0000000..d3e6737 --- /dev/null +++ b/documentation/architecture/openspec/specs/charset-detection/design.md @@ -0,0 +1,82 @@ +# Charset Detection Design + +## Trial Codecs Usage Patterns + +### Context + +The `trial_codecs` behavior parameter controls which character sets are tried +during decoding operations. 
Analysis revealed three distinct usage patterns +with different requirements, leading to platform-specific failures when the +same codec order was used for all contexts. + +### Usage Patterns + +#### Opportunistic Decoding + +**Goal**: Find any charset that produces readable text from content. + +**Context**: The `decode()` function and general content decoding. + +**Strategy**: Try multiple codecs including OS default until one succeeds. + +**Codecs**: `(OsDefault, UserSupplement, FromInference)` + +**Rationale**: On modern systems (Linux/Mac), OsDefault is UTF-8, providing a +good first guess that corrects common chardet misdetections. + +#### Authoritative Validation + +**Goal**: Verify that a specific authoritative charset works (no fallbacks). + +**Context**: HTTP `Content-Type` headers, MIME type charset validation. + +**Strategy**: Only try the explicitly specified charset. + +**Codecs**: `(FromInference,)` + +**Rationale**: When a charset is authoritatively specified (e.g., HTTP header), +we must test that exact charset, not find alternatives. OS default fallbacks +would mask validation failures. + +#### Detection Confirmation + +**Goal**: Validate detected charset with optional user hint as fallback. + +**Context**: Charset detection confirmation in `_confirm_charset_detection()`. + +**Strategy**: Try detected charset, then user supplement if detection fails. + +**Codecs**: `(UserSupplement, FromInference)` + +**Rationale**: Validates the detection result but respects user knowledge as +a fallback. Excludes OS default to prevent Windows cp1252 from masking +detection failures. 
+ +### Implementation + +Each context overrides `trial_codecs` via `__.dcls.replace()` before +calling codec trial functions: + +```python +# Authoritative validation +behaviors_strict = __.dcls.replace( + behaviors, + trial_codecs = ( _CodecSpecifiers.FromInference, ) ) + +# Detection confirmation +behaviors_no_os = __.dcls.replace( + behaviors, + trial_codecs = ( + _CodecSpecifiers.UserSupplement, + _CodecSpecifiers.FromInference, + ) ) +``` + +### Platform Considerations + +**Windows Issue**: OS default charset is cp1252, an 8-bit encoding that +decodes any byte sequence. When used in validation contexts, it masks +detection failures by succeeding when it shouldn't. + +**Solution**: Exclude `OsDefault` from validation and confirmation contexts, +using it only for opportunistic decoding where fallbacks are desired. diff --git a/documentation/architecture/openspec/specs/charset-detection/spec.md b/documentation/architecture/openspec/specs/charset-detection/spec.md new file mode 100644 index 0000000..df6d1f8 --- /dev/null +++ b/documentation/architecture/openspec/specs/charset-detection/spec.md @@ -0,0 +1,44 @@ +# Charset Detection + +## Purpose +This capability detects the character encoding of byte content to ensure it can be properly decoded into text without encoding errors. + +## Requirements + +### Requirement: Auto-Detection +The system SHALL auto-detect character encoding using statistical analysis of the byte content. + +Priority: Critical + +#### Scenario: Detect encoding +- **WHEN** byte content is analyzed +- **THEN** the most likely character encoding is returned +- **AND** a confidence score is provided + +### Requirement: UTF-8 Preference +The system SHALL prefer UTF-8 when ASCII content could be valid as either ASCII or UTF-8, aligning with modern standards. 
+ +Priority: Critical + +#### Scenario: Prefer UTF-8 +- **WHEN** content is valid ASCII +- **THEN** the system reports it as UTF-8 (or compatible subset) if not explicitly distinguished + +### Requirement: Validation +The system SHALL validate detected encodings by attempting decode operations to prevent false positives. + +Priority: Critical + +#### Scenario: Validate by decoding +- **WHEN** a potential encoding is identified +- **THEN** the system attempts to decode the content +- **AND** discards the encoding if decoding fails + +### Requirement: Python Compatibility +The system SHALL return encoding names compatible with Python's codec system. + +Priority: Critical + +#### Scenario: Compatible names +- **WHEN** an encoding is returned +- **THEN** it can be used directly with `bytes.decode()` diff --git a/documentation/architecture/openspec/specs/line-separator-processing/spec.md b/documentation/architecture/openspec/specs/line-separator-processing/spec.md new file mode 100644 index 0000000..d2e95b1 --- /dev/null +++ b/documentation/architecture/openspec/specs/line-separator-processing/spec.md @@ -0,0 +1,33 @@ +# Line Separator Processing + +## Purpose +This capability detects and normalizes line separators to ensure consistent text processing across different platforms (Windows, macOS, Linux). + +## Requirements + +### Requirement: Separator Detection +The system SHALL detect line separator types (CR, LF, CRLF) from byte or text content. + +Priority: Critical + +#### Scenario: Detect CRLF +- **WHEN** content containing `\r\n` is analyzed +- **THEN** the system identifies the separator as CRLF + +### Requirement: Normalization to Unix +The system SHALL normalize line endings to Unix LF (`\n`) format for internal processing consistency. 
+ +Priority: Critical + +#### Scenario: Normalize text +- **WHEN** text with mixed or non-Unix line endings is processed +- **THEN** all line separators are converted to `\n` + +### Requirement: Platform Conversion +The system SHALL support converting line endings to platform-specific formats when needed for output. + +Priority: Critical + +#### Scenario: Convert to Windows +- **WHEN** text needs to be saved for Windows +- **THEN** `\n` characters are converted to `\r\n` diff --git a/documentation/architecture/openspec/specs/mimetype-detection/spec.md b/documentation/architecture/openspec/specs/mimetype-detection/spec.md new file mode 100644 index 0000000..0fdaa78 --- /dev/null +++ b/documentation/architecture/openspec/specs/mimetype-detection/spec.md @@ -0,0 +1,44 @@ +# Mimetype Detection + +## Purpose +This capability enables the detection of MIME types from byte content or file locations. It allows applications to determine appropriate content handling strategies by identifying the format of the data. + +## Requirements + +### Requirement: Content-Based Detection +The system SHALL detect MIME types using content-based analysis (magic bytes) to ensure accurate identification even without file extensions. + +Priority: Critical + +#### Scenario: Detect from bytes +- **WHEN** raw byte content is provided +- **THEN** the system returns the detected MIME type based on magic numbers +- **AND** a confidence score is provided + +### Requirement: Fallback Detection +The system SHALL fall back to file extension-based detection when content detection fails or provides low confidence results. 
+ +Priority: Critical + +#### Scenario: Fallback to extension +- **WHEN** content detection returns indeterminate results +- **AND** a file path is provided +- **THEN** the system determines the MIME type based on the file extension + +### Requirement: Standardized Output +The system SHALL return standardized MIME type strings (e.g., "text/plain", "application/json") to ensure consistency across applications. + +Priority: Critical + +#### Scenario: Standardized format +- **WHEN** a MIME type is detected +- **THEN** it matches the IANA media type registry format + +### Requirement: Textual Type Identification +The system SHALL identify if a MIME type represents textual content to facilitate text processing decisions. + +Priority: High + +#### Scenario: Identify textual types +- **WHEN** a MIME type is checked +- **THEN** the system correctly identifies if it is textual (e.g., "text/html", "application/json") or binary diff --git a/documentation/architecture/openspec/specs/text-validation/spec.md b/documentation/architecture/openspec/specs/text-validation/spec.md new file mode 100644 index 0000000..261ac70 --- /dev/null +++ b/documentation/architecture/openspec/specs/text-validation/spec.md @@ -0,0 +1,24 @@ +# Text Validation + +## Purpose +This capability determines if content represents meaningful text, preventing the processing of binary data as text which could lead to errors or corruption. + +## Requirements + +### Requirement: Heuristic Validation +The system SHALL validate decoded text content using heuristics such as the ratio of printable characters and control characters. 
+ +Priority: High + +#### Scenario: Validate text +- **WHEN** decoded text is analyzed +- **THEN** it is classified as valid text only if it meets configured heuristics (e.g., sufficient printable characters) + +### Requirement: Profile Support +The system SHALL support configurable profiles for textual validation to handle different definitions of "valid text" (e.g., terminal safe, printer safe). + +Priority: High + +#### Scenario: Use profile +- **WHEN** validating text with a specific profile +- **THEN** the validation logic respects the profile's allowed and rejected character sets diff --git a/documentation/prd.rst b/documentation/prd.rst deleted file mode 100644 index fd76b51..0000000 --- a/documentation/prd.rst +++ /dev/null @@ -1,187 +0,0 @@ -.. vim: set fileencoding=utf-8: -.. -*- coding: utf-8 -*- -.. +--------------------------------------------------------------------------+ - | | - | Licensed under the Apache License, Version 2.0 (the "License"); | - | you may not use this file except in compliance with the License. | - | You may obtain a copy of the License at | - | | - | http://www.apache.org/licenses/LICENSE-2.0 | - | | - | Unless required by applicable law or agreed to in writing, software | - | distributed under the License is distributed on an "AS IS" BASIS, | - | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | - | See the License for the specific language governing permissions and | - | limitations under the License. 
| - | | - +--------------------------------------------------------------------------+ - - -******************************************************************************* -Product Requirements Document -******************************************************************************* - -Executive Summary -=============================================================================== - -The **detextive** library provides consolidated text detection and processing -capabilities to replace duplicated MIME type detection, charset detection, and -newline processing across multiple Python packages. It serves as a drop-in -replacement that standardizes textual content analysis with consistent APIs -and improved reliability. - -Problem Statement -=============================================================================== - -Multiple Python packages in the project ecosystem contain duplicated -implementations of text detection functionality: - -- **python-mimeogram**: MIME type and charset detection in acquirers.py and - parts.py -- **python-librovore**: Textual MIME type validation in cacheproxy.py -- **ai-experiments**: Charset detection and MIME type validation in - utilities.py - -This duplication creates maintenance overhead, inconsistent behavior, and -increases the likelihood of bugs. Each implementation has evolved separately -with different edge case handling and detection heuristics. 
- -Goals and Objectives -=============================================================================== - -**Primary Objectives**: - -* Consolidate text detection functionality into a single, well-tested library -* Provide drop-in replacement APIs that minimize migration effort -* Improve detection accuracy and consistency across all dependent packages - -**Secondary Objectives**: - -* Reduce maintenance overhead by eliminating code duplication -* Establish standardized text processing patterns for future projects -* Enable easier testing and validation of text detection logic - -**Success Metrics**: - -* All dependent packages successfully migrate with minimal code changes -* Detection accuracy matches or exceeds existing implementations -* Library passes comprehensive test suite covering edge cases - -Target Users -=============================================================================== - -**Primary Users**: - -* **Internal Developers**: Team members working on mimeogram, librovore, and - ai-experiments packages -* **Package Maintainers**: Developers responsible for library maintenance and - updates - -**Usage Context**: - -* Integration as a dependency in existing Python packages -* Programmatic text analysis and content processing workflows -* File and web content processing pipelines - -Functional Requirements -=============================================================================== - -**REQ-001: MIME Type Detection API** *(Critical)* - -As a developer, I want to detect MIME types from byte content so that I can -determine appropriate content handling strategies. 
- -*Acceptance Criteria*: -- Detect MIME types using content-based analysis (magic bytes) -- Fall back to file extension-based detection when content detection fails -- Support both file paths and raw byte content as input -- Return standardized MIME type strings (e.g., "text/plain", "application/json") - -**REQ-002: Charset Detection API** *(Critical)* - -As a developer, I want to detect character encoding from byte content so that -I can decode text properly without encoding errors. - -*Acceptance Criteria*: -- Auto-detect character encoding using statistical analysis -- Prefer UTF-8 when ASCII content could be either ASCII or UTF-8 -- Validate detected encodings by attempting decode operations -- Return encoding names compatible with Python's codec system - -**REQ-003: Line Separator Processing** *(Critical)* - -As a developer, I want to detect and normalize line separators so that I can -process text consistently across different platforms. - -*Acceptance Criteria*: -- Detect line separator types (CR, LF, CRLF) from byte or text content -- Normalize line endings to Unix LF format -- Convert line endings to platform-specific formats when needed -- Handle mixed line ending scenarios gracefully - -**REQ-004: Textual Content Validation** *(High)* - -As a developer, I want to determine if content represents meaningful text so -that I can avoid processing binary data as text. - -*Acceptance Criteria*: -- Classify MIME types as textual or non-textual -- Support extensible patterns for textual MIME type detection -- Validate decoded text content using heuristics (control character ratios, printable character ratios) -- Handle edge cases like empty content and single-character repetition - -**REQ-005: Drop-in Replacement Interface** *(High)* - -As a developer migrating existing code, I want compatible APIs so that I can -replace existing functions with minimal code changes. 
- -*Acceptance Criteria*: -- Maintain similar function signatures to existing implementations -- Support same input/output data types where possible -- Preserve existing behavior for common use cases -- Provide clear migration documentation for API differences - -Non-Functional Requirements -=============================================================================== - -**Performance Requirements**: -- MIME type detection should complete within 100ms for files up to 1MB -- Charset detection should analyze sufficient content sample (default 1KB) for accuracy -- Memory usage should remain proportional to sample size, not full file size - -**Reliability Requirements**: -- Library should handle malformed or unusual content without crashing -- Error conditions should be clearly communicated through appropriate exceptions -- Detection accuracy should be >= 95% for common text formats - -**Compatibility Requirements**: -- Support Python 3.8+ (matching existing package requirements) -- Compatible with existing dependency versions in target packages -- Platform-independent operation (Windows, macOS, Linux) - -Constraints and Assumptions -=============================================================================== - -**Technical Constraints**: -- Must integrate with existing package dependency management -- Limited to detection libraries already used in the ecosystem (chardet, puremagic) -- Cannot introduce breaking changes to existing public APIs during migration - -**Dependencies**: -- Migration requires coordination across multiple package maintainers -- Success depends on comprehensive test coverage of existing behavior -- Requires validation against real-world content from existing use cases - -**Assumptions**: -- Existing packages can accept new library dependency -- Current detection logic represents desired behavior (not bugs to be fixed) -- UTF-8 bias aligns with project content expectations - -Out of Scope 
-=============================================================================== - -* Content conversion or transformation beyond line ending normalization -* Support for legacy or exotic character encodings beyond what chardet provides -* MIME type validation or correction (library reports detected types as-is) -* Performance optimization for very large files (> 100MB) -* Integration with external content detection services or APIs \ No newline at end of file