jgm · jgm · Oct 23, 2021 · Oct 22, 2021 · Oct 22, 2021 · Oct 23, 2021
diff --git a/src/Text/DocLayout.hs b/src/Text/DocLayout.hs
@@ -1,13 +1,14 @@
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE BangPatterns      #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE CPP               #-}
-{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE DeriveGeneric     #-}
-{-# LANGUAGE DeriveFunctor     #-}
-{-# LANGUAGE DeriveFoldable    #-}
-{-# LANGUAGE DeriveTraversable #-}
+{-# LANGUAGE BangPatterns       #-}
+{-# LANGUAGE CPP                #-}
 {-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE DeriveFoldable     #-}
+{-# LANGUAGE DeriveFunctor      #-}
+{-# LANGUAGE DeriveGeneric      #-}
+{-# LANGUAGE DeriveTraversable  #-}
+{-# LANGUAGE FlexibleInstances  #-}
+{-# LANGUAGE MultiWayIf         #-}
+{-# LANGUAGE NoImplicitPrelude  #-}
+{-# LANGUAGE OverloadedStrings  #-}
 {- |
    Module      : Text.DocLayout
    Copyright   : Copyright (C) 2010-2019 John MacFarlane
@@ -700,12 +701,12 @@ realLength = realLengthNarrowContext
 -- | Get the real length of a string, taking into account combining and
 -- double-wide characters. Ambiguous characters are treated as width 1.
 realLengthNarrowContext :: HasChars a => a -> Int
-realLengthNarrowContext = realLengthWith . updateMatchState $ resolveWidth 1
+realLengthNarrowContext = realLengthWith updateMatchStateNarrow
 
 -- | Get the real length of a string, taking into account combining and
 -- double-wide characters. Ambiguous characters are treated as width 2.
 realLengthWideContext :: HasChars a => a -> Int
-realLengthWideContext = realLengthWith . updateMatchState $ resolveWidth 2
+realLengthWideContext = realLengthWith updateMatchStateWide
 
 -- | Resolve ambiguous characters based on their context.
 resolveWidth :: Int -> UnicodeWidth -> Int
@@ -725,58 +726,226 @@ realLengthWith f = extractLength . foldlChar f (MatchState True 0 False 0 mempty
 -- | Update a 'MatchState' by processing a character.
 -- For efficiency, we isolate commonly used portions of the basic
 -- multilingual plane that do not have emoji in them.
-updateMatchState :: (UnicodeWidth -> Int) -> MatchState -> Char -> MatchState
-updateMatchState resolve (MatchState first tot _ _ Nothing) !c
+-- This works in a narrow context.
+updateMatchStateNarrow :: MatchState -> Char -> MatchState
+updateMatchStateNarrow (MatchState first tot _ _ Nothing) !c
     -- Control characters have width 0: friends don't let friends use tabs
-    | c <= '\x001F'                     = controlState
+    | c <= '\x001F'  = controlState
     -- ASCII
-    | c <= '\x007E'                     = narrowState
-    -- Maximum contiguous range of width 2 containing CJK
-    | c >= '\x4DC0' && c <= '\x4DFF'    = narrowState     -- Hexagrams
-    | c >= '\x3250' && c <= '\xA4C6'    = wideState       -- Han ideographs
-    -- Hiragana and katakana
-    | c >= '\x3099' && c <= '\x309A'    = combiningState  -- Combining
-    | c == '\x303F'                     = narrowState     -- Half-fill space
-    | c >= '\x3030' && c <= '\x3247'    = wideState       -- Hiragana and Katakana
-    | c >= '\x3248' && c <= '\x324F'    = ambiguousState  -- Circled numbers
-    -- Combining characters have width 0
-    | c >= '\x0300' && c <= '\x036F'    = combiningState
-    | c >= '\x0483' && c <= '\x0489'    = combiningState
-    | c >= '\x0591' && c <= '\x05BD'    = combiningState
-    | c == '\x05BF'                     = combiningState
+    | c <= '\x007E'  = narrowState
+    -- More control characters
+    | c <= '\x009F'  = controlState
+    -- Extended Latin: Latin 1-supplement, Extended-A, Extended-B, IPA Extensions.
+    -- This block is full of ambiguous characters, so these shortcuts will not
+    -- work in a wide context.
+    | c == '\x00AD'  = controlState    -- Soft hyphen
+    | c <= '\x02FF'  = narrowState
+    -- Combining diacritical marks used in Latin and other scripts
+    | c <= '\x036F'  = combiningState
+    -- Han ideographs
+    | c >= '\x3250' && c <= '\xA4CF' =
+        if | c <= '\x4DBF' -> wideState       -- Han ideographs
+           | c <= '\x4DFF' -> narrowState     -- Hexagrams
+           | otherwise     -> wideState       -- More Han ideographs
+    -- Arabic
+    | c >= '\x0600' && c <= '\x06FF' =
+        if | c <= '\x0605' -> controlState    -- Number marks
+           | c <= '\x060F' -> narrowState     -- Punctuation and marks
+           | c <= '\x061A' -> combiningState  -- Combining marks
+           | c == '\x061B' -> narrowState     -- Arabic semicolon
+           | c <= '\x061C' -> controlState    -- Letter mark
+           | c <= '\x064A' -> narrowState     -- Main Arabic abjad
+           | c <= '\x065F' -> combiningState  -- Arabic vowel markers
+           | c == '\x0670' -> combiningState  -- Superscript alef
+           | c <= '\x06D5' -> narrowState     -- Arabic digits and letters used in other languages
+           | c <= '\x06DC' -> combiningState  -- Small high ligatures
+           | c == '\x06DD' -> controlState    -- End of ayah
+           | c == '\x06DE' -> narrowState     -- Start of rub el hizb
+           | c <= '\x06E4' -> combiningState  -- More small high ligatures
+           | c <= '\x06E6' -> narrowState     -- Small vowels
+           | c == '\x06E9' -> narrowState     -- Place of sajdah
+           | c <= '\x06ED' -> combiningState  -- More combining
+           | otherwise     -> narrowState     -- All the rest
+    -- Devanagari (plus one Bengali character)
+    | c >= '\x0900' && c <= '\x0980' =
+        if | c <= '\x0903' -> combiningState  -- Combining characters
+           | c <= '\x0939' -> narrowState     -- Main Devanagari abugida
+           | c == '\x093D' -> narrowState     -- Devanagari avagraha
+           | c == '\x0950' -> narrowState     -- Devanagari om
+           | c <= '\x0957' -> combiningState  -- Combining characters
+           | c == '\x0962' -> combiningState  -- Combining character
+           | c == '\x0963' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Devanagari digits and up to beginning of Bengali block
+    -- Bengali (plus a couple Gurmukhi characters)
+    | c >= '\x0981' && c <= '\x0A03' =
+        if | c <= '\x0983' -> combiningState  -- Combining signs
+           | c <= '\x09B9' -> narrowState     -- Main Bengali abugida
+           | c == '\x09BD' -> narrowState     -- Bengali avagraha
+           | c == '\x09CE' -> narrowState     -- Bengali khanda ta
+           | c <= '\x09D7' -> combiningState  -- Combining marks
+           | c == '\x09E2' -> combiningState  -- Bengali vocalic vowel signs
+           | c == '\x09E3' -> combiningState  -- Bengali vocalic vowel signs
+           | c <= '\x09FD' -> narrowState     -- Bengali digits and other symbols
+           | otherwise     -> combiningState  -- Bengali sandhi mark, plus a few symbols from Gurmukhi
+    -- Cyrillic (plus Greek and Armenian for free)
+    -- This block has many ambiguous characters, and so cannot be used in wide contexts
+    | c >= '\x0370' && c <= '\x058F' =
+        if | c <= '\x0482' -> narrowState     -- Main Greek and Cyrillic block
+           | c <= '\x0489' -> combiningState  -- Cyrillic combining characters
+           | otherwise     -> narrowState     -- Extra Cyrillic characters used in Ukrainian and others, plus Armenian
+    -- Japanese
+    | c >= '\x2E80' && c <= '\x324F' =
+        if | c <= '\x3029' -> wideState       -- Punctuation and others
+           | c <= '\x302F' -> combiningState  -- Tone marks
+           | c == '\x303F' -> narrowState     -- Half-fill space
+           | c <= '\x3096' -> wideState       -- Hiragana and others
+           | c <= '\x309A' -> combiningState  -- Hiragana voiced marks
+           | c <= '\x3247' -> wideState       -- Katakana plus compatibility Jamo for Korean
+           | otherwise     -> ambiguousState  -- Circled numbers
+    -- Korean
+    | c >= '\xAC00' && c <= '\xD7A3' = wideState  -- Precomposed Hangul
+    -- Telugu (plus one character of Kannada)
+    | c >= '\x0C00' && c <= '\x0C80' =
+        if | c <= '\x0C04' -> combiningState  -- Combining characters
+           | c <= '\x0C39' -> narrowState     -- Main Telugu abugida
+           | c == '\x0C3D' -> narrowState     -- Telugu avagraha
+           | c <= '\x0C56' -> combiningState  -- Vowel markers
+           | c == '\x0C62' -> combiningState  -- Combining character
+           | c == '\x0C63' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Telugu digits
+    -- Tamil
+    | c >= '\x0B80' && c <= '\x0BFF' =
+        if | c <= '\x0B82' -> combiningState  -- Combining characters
+           | c <= '\x0BB9' -> narrowState     -- Main Tamil abugida
+           | c <= '\x0BCD' -> combiningState  -- Vowel markers
+           | c == '\x0BD7' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Tamil digits and others
   where
     narrowState    = MatchState False (tot + 1) True 0 Nothing
     wideState      = MatchState False (tot + 2) False 0 Nothing
     combiningState = let w = if first then 1 else 0 in MatchState False (tot + w) False 0 Nothing
     controlState   = MatchState False tot False 0 Nothing
-    ambiguousState = let w = resolve Ambiguous in MatchState False (tot + w) False 0 Nothing
-updateMatchState resolve s c            = updateMatchStateNoShortcut resolve s c
+    ambiguousState = MatchState False (tot + 1) False 0 Nothing
+updateMatchStateNarrow s c = updateMatchStateNoShortcut 1 s c
+
+-- | Update a 'MatchState' by processing a character.
+-- For efficiency, we isolate commonly used portions of the basic
+-- multilingual plane that do not have emoji in them.
+-- This works in a wide context.
+updateMatchStateWide :: MatchState -> Char -> MatchState
+updateMatchStateWide (MatchState first tot _ _ Nothing) !c
+    -- Control characters have width 0: friends don't let friends use tabs
+    | c <= '\x001F'  = controlState
+    -- ASCII
+    | c <= '\x007E'  = narrowState
+    -- Han ideographs
+    | c >= '\x3250' && c <= '\xA4CF' =
+        if | c <= '\x4DBF' -> wideState       -- Han ideographs
+           | c <= '\x4DFF' -> narrowState     -- Hexagrams
+           | otherwise     -> wideState       -- More Han ideographs
+    -- Japanese
+    | c >= '\x2E80' && c <= '\x324F' =
+        if | c <= '\x3029' -> wideState       -- Punctuation and others
+           | c <= '\x302F' -> combiningState  -- Tone marks
+           | c == '\x303F' -> narrowState     -- Half-fill space
+           | c <= '\x3096' -> wideState       -- Hiragana and others
+           | c <= '\x309A' -> combiningState  -- Hiragana voiced marks
+           | c <= '\x3247' -> wideState       -- Katakana plus compatibility Jamo for Korean
+           | otherwise     -> ambiguousState  -- Circled numbers
+    -- Korean
+    | c >= '\xAC00' && c <= '\xD7A3' = wideState  -- Precomposed Hangul
+    -- Combining diacritical marks used in Latin and other scripts
+    | c >= '\x0300' && c <= '\x036F'  = combiningState
+    -- Arabic
+    | c >= '\x0600' && c <= '\x06FF' =
+        if | c <= '\x0605' -> controlState    -- Number marks
+           | c <= '\x060F' -> narrowState     -- Punctuation and marks
+           | c <= '\x061A' -> combiningState  -- Combining marks
+           | c == '\x061B' -> narrowState     -- Arabic semicolon
+           | c <= '\x061C' -> controlState    -- Letter mark
+           | c <= '\x064A' -> narrowState     -- Main Arabic abjad
+           | c <= '\x065F' -> combiningState  -- Arabic vowel markers
+           | c == '\x0670' -> combiningState  -- Superscript alef
+           | c <= '\x06D5' -> narrowState     -- Arabic digits and letters used in other languages
+           | c <= '\x06DC' -> combiningState  -- Small high ligatures
+           | c == '\x06DD' -> controlState    -- End of ayah
+           | c == '\x06DE' -> narrowState     -- Start of rub el hizb
+           | c <= '\x06E4' -> combiningState  -- More small high ligatures
+           | c <= '\x06E6' -> narrowState     -- Small vowels
+           | c == '\x06E9' -> narrowState     -- Place of sajdah
+           | c <= '\x06ED' -> combiningState  -- More combining
+           | otherwise     -> narrowState     -- All the rest
+    -- Devanagari (plus one Bengali character)
+    | c >= '\x0900' && c <= '\x0980' =
+        if | c <= '\x0903' -> combiningState  -- Combining characters
+           | c <= '\x0939' -> narrowState     -- Main Devanagari abugida
+           | c == '\x093D' -> narrowState     -- Devanagari avagraha
+           | c == '\x0950' -> narrowState     -- Devanagari om
+           | c <= '\x0957' -> combiningState  -- Combining characters
+           | c == '\x0962' -> combiningState  -- Combining character
+           | c == '\x0963' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Devanagari digits and up to beginning of Bengali block
+    -- Bengali (plus a couple Gurmukhi characters)
+    | c >= '\x0981' && c <= '\x0A03' =
+        if | c <= '\x0983' -> combiningState  -- Combining signs
+           | c <= '\x09B9' -> narrowState     -- Main Bengali abugida
+           | c == '\x09BD' -> narrowState     -- Bengali avagraha
+           | c == '\x09CE' -> narrowState     -- Bengali khanda ta
+           | c <= '\x09D7' -> combiningState  -- Combining marks
+           | c == '\x09E2' -> combiningState  -- Bengali vocalic vowel signs
+           | c == '\x09E3' -> combiningState  -- Bengali vocalic vowel signs
+           | c <= '\x09FD' -> narrowState     -- Bengali digits and other symbols
+           | otherwise     -> combiningState  -- Bengali sandhi mark, plus a few symbols from Gurmukhi
+    -- Telugu (plus one character of Kannada)
+    | c >= '\x0C00' && c <= '\x0C80' =
+        if | c <= '\x0C04' -> combiningState  -- Combining characters
+           | c <= '\x0C39' -> narrowState     -- Main Telugu abugida
+           | c == '\x0C3D' -> narrowState     -- Telugu avagraha
+           | c <= '\x0C56' -> combiningState  -- Vowel markers
+           | c == '\x0C62' -> combiningState  -- Combining character
+           | c == '\x0C63' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Telugu digits
+    -- Tamil
+    | c >= '\x0B80' && c <= '\x0BFF' =
+        if | c <= '\x0B82' -> combiningState  -- Combining characters
+           | c <= '\x0BB9' -> narrowState     -- Main Tamil abugida
+           | c <= '\x0BCD' -> combiningState  -- Vowel markers
+           | c == '\x0BD7' -> combiningState  -- Combining character
+           | otherwise     -> narrowState     -- Tamil digits and others
+  where
+    narrowState    = MatchState False (tot + 1) True 0 Nothing
+    wideState      = MatchState False (tot + 2) False 0 Nothing
+    combiningState = let w = if first then 1 else 0 in MatchState False (tot + w) False 0 Nothing
+    controlState   = MatchState False tot False 0 Nothing
+    ambiguousState = MatchState False (tot + 2) False 0 Nothing
+updateMatchStateWide s c = updateMatchStateNoShortcut 2 s c
 
 -- | Update a 'MatchState' by processing a character, without taking any
 -- shortcuts. This should give the same answer as 'updateMatchState', but will
 -- be slower. It is here to test that the shortcuts are implemented correctly.
-updateMatchStateNoShortcut :: (UnicodeWidth -> Int) -> MatchState -> Char -> MatchState
-updateMatchStateNoShortcut resolve (MatchState first tot lastNarrow _ Nothing) !c
+updateMatchStateNoShortcut :: Int -> MatchState -> Char -> MatchState
+updateMatchStateNoShortcut !ambiguous (MatchState first tot lastNarrow _ Nothing) !c
     | isEmojiVariation c = MatchState False (if lastNarrow then tot + 1 else tot) False 0 Nothing
     | otherwise = case IM.lookupLE oc unicodeWidthMap of
         -- If there is a specific match, record the tentative width, the map of
         -- continuations, and move to the next character
         Just (!oc', SpecificMatch r w m) | oc == oc' ->
             let r' = fromMaybe r w
-            in MatchState False tot (r' == Narrow) (resolve r') (Just m)
+             in MatchState False tot (isNarrow r') (resolveWidth ambiguous r') (Just m)
         -- If there is only a range match, record the total width and move to
         -- the next character
         Just (!_, !match) ->
             let r = rangeWidth match
                 -- If the string starts with a combining character.  Since there is no
                 -- preceding character, we count 0 width as 1 in this one case:
-                r' = resolve $ if first && r == Combining then Narrow else r
-            in MatchState False (tot + r') (r == Narrow) 0 Nothing
+                r' = resolveWidth ambiguous $ if first && r == Combining then Narrow else r
+            in MatchState False (tot + r') (isNarrow r) 0 Nothing
         -- M.lookupLE should not fail
         Nothing -> MatchState False (tot + 1) False 0 Nothing
   where
+    isNarrow x = x == Narrow || (x == Ambiguous && ambiguous == 1)
     oc = ord c
-updateMatchStateNoShortcut resolve (MatchState _ tot _ tent (Just !m)) !c
+updateMatchStateNoShortcut !ambiguous (MatchState _ tot _ tent (Just !m)) !c
     -- Variation modifiers modify the emoji up to this point, so can be
     -- discarded. However, they always make it width 2, so we set the tentative
     -- width to 2.
@@ -794,7 +963,9 @@ updateMatchStateNoShortcut resolve (MatchState _ tot _ tent (Just !m)) !c
         Just m' -> MatchState False tot False 2 (Just m')
         -- No continuations match, use the tentative width and process c without continuations
         -- I guess we use shortcuts here; that's probably fine.
-        Nothing -> updateMatchState resolve (MatchState False (tot + tent) False 0 Nothing) c
+        Nothing -> if ambiguous == 1
+                      then updateMatchStateNarrow (MatchState False (tot + tent) False 0 Nothing) c
+                      else updateMatchStateWide (MatchState False (tot + tent) False 0 Nothing) c
 
 -- | Keeps track of state in length calculations, determining whether we're at
 -- the first character, the width so far, possibly a tentative width for this

diff --git a/test/test.hs b/test/test.hs
@@ -313,8 +313,8 @@ tests =
       zwjEmojis <&> \emoji -> testCase (T.unpack emoji) $ realLength emoji @?= 2
 
   , testProperty "shortcut provides same answer for string length in a narrow context" . withMaxSuccess 1000000 $
-      \(x :: String) -> realLengthNarrowContext x === realLengthWith (updateMatchStateNoShortcut (resolveWidth 1)) x
+      \(x :: String) -> realLengthNarrowContext x === realLengthWith (updateMatchStateNoShortcut 1) x
 
   , testProperty "shortcut provides same answer for string length in a wide context" . withMaxSuccess 1000000 $
-      \(x :: String) -> realLengthWideContext x === realLengthWith (updateMatchStateNoShortcut (resolveWidth 2)) x
+      \(x :: String) -> realLengthWideContext x === realLengthWith (updateMatchStateNoShortcut 2) x
   ]