Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 208 additions & 37 deletions src/Text/DocLayout.hs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.DocLayout
Copyright : Copyright (C) 2010-2019 John MacFarlane
Expand Down Expand Up @@ -700,12 +701,12 @@ realLength = realLengthNarrowContext
-- | Get the real length of a string, taking into account combining and
-- double-wide characters. Ambiguous characters are treated as width 1.
realLengthNarrowContext :: HasChars a => a -> Int
realLengthNarrowContext = realLengthWith . updateMatchState $ resolveWidth 1
realLengthNarrowContext = realLengthWith updateMatchStateNarrow

-- | Get the real length of a string, taking into account combining and
-- double-wide characters. Ambiguous characters are treated as width 2.
realLengthWideContext :: HasChars a => a -> Int
realLengthWideContext = realLengthWith . updateMatchState $ resolveWidth 2
realLengthWideContext = realLengthWith updateMatchStateWide

-- | Resolve ambiguous characters based on their context.
resolveWidth :: Int -> UnicodeWidth -> Int
Expand All @@ -725,58 +726,226 @@ realLengthWith f = extractLength . foldlChar f (MatchState True 0 False 0 mempty
-- | Update a 'MatchState' by processing a character.
-- For efficiency, we isolate commonly used portions of the basic
-- multilingual plane that do not have emoji in them.
updateMatchState :: (UnicodeWidth -> Int) -> MatchState -> Char -> MatchState
updateMatchState resolve (MatchState first tot _ _ Nothing) !c
-- This works in a narrow context.
updateMatchStateNarrow :: MatchState -> Char -> MatchState
updateMatchStateNarrow (MatchState first tot _ _ Nothing) !c
-- Control characters have width 0: friends don't let friends use tabs
| c <= '\x001F' = controlState
| c <= '\x001F' = controlState
-- ASCII
| c <= '\x007E' = narrowState
-- Maximum contiguous range of width 2 containing CJK
| c >= '\x4DC0' && c <= '\x4DFF' = narrowState -- Hexagrams
| c >= '\x3250' && c <= '\xA4C6' = wideState -- Han ideographs
-- Hiragana and katakana
| c >= '\x3099' && c <= '\x309A' = combiningState -- Combining
| c == '\x303F' = narrowState -- Half-fill space
| c >= '\x3030' && c <= '\x3247' = wideState -- Hiragana and Katakana
| c >= '\x3248' && c <= '\x324F' = ambiguousState -- Circled numbers
-- Combining characters have width 0
| c >= '\x0300' && c <= '\x036F' = combiningState
| c >= '\x0483' && c <= '\x0489' = combiningState
| c >= '\x0591' && c <= '\x05BD' = combiningState
| c == '\x05BF' = combiningState
| c <= '\x007E' = narrowState
-- More control characters
| c <= '\x009F' = controlState
-- Extended Latin: Latin 1-supplement, Extended-A, Extended-B, IPA Extensions.
-- This block is full of ambiguous characters, so these shortcuts will not
-- work in a wide context.
| c == '\x00AD' = controlState -- Soft hyphen
| c <= '\x02FF' = narrowState
-- Combining diacritical marks used in Latin and other scripts
| c <= '\x036F' = combiningState
-- Han ideographs
| c >= '\x3250' && c <= '\xA4CF' =
if | c <= '\x4DBF' -> wideState -- Han ideographs
| c <= '\x4DFF' -> narrowState -- Hexagrams
| otherwise -> wideState -- More Han ideographs
-- Arabic
| c >= '\x0600' && c <= '\x06FF' =
if | c <= '\x0605' -> controlState -- Number marks
| c <= '\x060F' -> narrowState -- Punctuation and marks
| c <= '\x061A' -> combiningState -- Combining marks
| c == '\x061B' -> narrowState -- Arabic semicolon
| c <= '\x061C' -> controlState -- Letter mark
| c <= '\x064A' -> narrowState -- Main Arabic abjad
| c <= '\x065F' -> combiningState -- Arabic vowel markers
| c == '\x0670' -> combiningState -- Superscript alef
| c <= '\x06D5' -> narrowState -- Arabic digits and letters used in other languages
| c <= '\x06DC' -> combiningState -- Small high ligatures
| c == '\x06DD' -> controlState -- End of ayah
| c == '\x06DE' -> narrowState -- Start of rub el hizb
| c <= '\x06E4' -> combiningState -- More small high ligatures
| c <= '\x06E6' -> narrowState -- Small vowels
| c == '\x06E9' -> narrowState -- Place of sajdah
| c <= '\x06ED' -> combiningState -- More combining
| otherwise -> narrowState -- All the rest
-- Devanagari (plus one Bengali character)
| c >= '\x0900' && c <= '\x0980' =
if | c <= '\x0903' -> combiningState -- Combining characters
| c <= '\x0939' -> narrowState -- Main Devanagari abugida
| c == '\x093D' -> narrowState -- Devanagari avagraha
| c == '\x0950' -> narrowState -- Devanagari om
| c <= '\x0957' -> combiningState -- Combining characters
| c == '\x0962' -> combiningState -- Combining character
| c == '\x0963' -> combiningState -- Combining character
| otherwise -> narrowState -- Devanagari digits and up to beginning of Bengali block
-- Bengali (plus a couple Gurmukhi characters)
| c >= '\x0981' && c <= '\x0A03' =
if | c <= '\x0983' -> combiningState -- Combining signs
| c <= '\x09B9' -> narrowState -- Main Bengali abugida
| c == '\x09BD' -> narrowState -- Bengali avagraha
| c == '\x09CE' -> narrowState -- Bengali khanda ta
| c <= '\x09D7' -> combiningState -- Combining marks
| c == '\x09E2' -> combiningState -- Bengali vocalic vowel signs
| c == '\x09E3' -> combiningState -- Bengali vocalic vowel signs
| c <= '\x09FD' -> narrowState -- Bengali digits and other symbols
| otherwise -> combiningState -- Bengali sandhi mark, plus a few symbols from Gurmukhi
-- Cyrillic (plus Greek and Armenian for free)
-- This block has many ambiguous characters, and so cannot be used in wide contexts
| c >= '\x0370' && c <= '\x058F' =
if | c <= '\x0482' -> narrowState -- Main Greek and Cyrillic block
| c <= '\x0489' -> combiningState -- Cyrillic combining characters
| otherwise -> narrowState -- Extra Cyrillic characters used in Ukrainian and others, plus Armenian
-- Japanese
| c >= '\x2E80' && c <= '\x324F' =
if | c <= '\x3029' -> wideState -- Punctuation and others
| c <= '\x302F' -> combiningState -- Tone marks
| c == '\x303F' -> narrowState -- Half-fill space
| c <= '\x3096' -> wideState -- Hiragana and others
| c <= '\x309A' -> combiningState -- Hiragana voiced marks
| c <= '\x3247' -> wideState -- Katakana plus compatibility Jamo for Korean
| otherwise -> ambiguousState -- Circled numbers
-- Korean
| c >= '\xAC00' && c <= '\xD7A3' = wideState -- Precomposed Hangul
-- Telugu (plus one character of Kannada)
| c >= '\x0C00' && c <= '\x0C80' =
if | c <= '\x0C04' -> combiningState -- Combining characters
| c <= '\x0C39' -> narrowState -- Main Telugu abugida
| c == '\x0C3D' -> narrowState -- Telugu avagraha
| c <= '\x0C56' -> combiningState -- Vowel markers
| c == '\x0C62' -> combiningState -- Combining character
| c == '\x0C63' -> combiningState -- Combining character
| otherwise -> narrowState -- Telugu digits
-- Tamil
| c >= '\x0B80' && c <= '\x0BFF' =
if | c <= '\x0B82' -> combiningState -- Combining characters
| c <= '\x0BB9' -> narrowState -- Main Tamil abugida
| c <= '\x0BCD' -> combiningState -- Vowel markers
| c == '\x0BD7' -> combiningState -- Combining character
| otherwise -> narrowState -- Tamil digits and others
where
narrowState = MatchState False (tot + 1) True 0 Nothing
wideState = MatchState False (tot + 2) False 0 Nothing
combiningState = let w = if first then 1 else 0 in MatchState False (tot + w) False 0 Nothing
controlState = MatchState False tot False 0 Nothing
ambiguousState = let w = resolve Ambiguous in MatchState False (tot + w) False 0 Nothing
updateMatchState resolve s c = updateMatchStateNoShortcut resolve s c
ambiguousState = MatchState False (tot + 1) False 0 Nothing
updateMatchStateNarrow s c = updateMatchStateNoShortcut 1 s c

-- | Update a 'MatchState' by processing a character.
-- For efficiency, we isolate commonly used portions of the basic
-- multilingual plane that do not have emoji in them.
-- This works in a wide context.
updateMatchStateWide :: MatchState -> Char -> MatchState
updateMatchStateWide (MatchState first tot _ _ Nothing) !c
-- Control characters have width 0: friends don't let friends use tabs
| c <= '\x001F' = controlState
-- ASCII
| c <= '\x007E' = narrowState
-- Han ideographs
| c >= '\x3250' && c <= '\xA4CF' =
if | c <= '\x4DBF' -> wideState -- Han ideographs
| c <= '\x4DFF' -> narrowState -- Hexagrams
| otherwise -> wideState -- More Han ideographs
-- Japanese
| c >= '\x2E80' && c <= '\x324F' =
if | c <= '\x3029' -> wideState -- Punctuation and others
| c <= '\x302F' -> combiningState -- Tone marks
| c == '\x303F' -> narrowState -- Half-fill space
| c <= '\x3096' -> wideState -- Hiragana and others
| c <= '\x309A' -> combiningState -- Hiragana voiced marks
| c <= '\x3247' -> wideState -- Katakana plus compatibility Jamo for Korean
| otherwise -> ambiguousState -- Circled numbers
-- Korean
| c >= '\xAC00' && c <= '\xD7A3' = wideState -- Precomposed Hangul
-- Combining diacritical marks used in Latin and other scripts
| c >= '\x0300' && c <= '\x036F' = combiningState
-- Arabic
| c >= '\x0600' && c <= '\x06FF' =
if | c <= '\x0605' -> controlState -- Number marks
| c <= '\x060F' -> narrowState -- Punctuation and marks
| c <= '\x061A' -> combiningState -- Combining marks
| c == '\x061B' -> narrowState -- Arabic semicolon
| c <= '\x061C' -> controlState -- Letter mark
| c <= '\x064A' -> narrowState -- Main Arabic abjad
| c <= '\x065F' -> combiningState -- Arabic vowel markers
| c == '\x0670' -> combiningState -- Superscript alef
| c <= '\x06D5' -> narrowState -- Arabic digits and letters used in other languages
| c <= '\x06DC' -> combiningState -- Small high ligatures
| c == '\x06DD' -> controlState -- End of ayah
| c == '\x06DE' -> narrowState -- Start of rub el hizb
| c <= '\x06E4' -> combiningState -- More small high ligatures
| c <= '\x06E6' -> narrowState -- Small vowels
| c == '\x06E9' -> narrowState -- Place of sajdah
| c <= '\x06ED' -> combiningState -- More combining
| otherwise -> narrowState -- All the rest
-- Devanagari (plus one Bengali character)
| c >= '\x0900' && c <= '\x0980' =
if | c <= '\x0903' -> combiningState -- Combining characters
| c <= '\x0939' -> narrowState -- Main Devanagari abugida
| c == '\x093D' -> narrowState -- Devanagari avagraha
| c == '\x0950' -> narrowState -- Devanagari om
| c <= '\x0957' -> combiningState -- Combining characters
| c == '\x0962' -> combiningState -- Combining character
| c == '\x0963' -> combiningState -- Combining character
| otherwise -> narrowState -- Devanagari digits and up to beginning of Bengali block
-- Bengali (plus a couple Gurmukhi characters)
| c >= '\x0981' && c <= '\x0A03' =
if | c <= '\x0983' -> combiningState -- Combining signs
| c <= '\x09B9' -> narrowState -- Main Bengali abugida
| c == '\x09BD' -> narrowState -- Bengali avagraha
| c == '\x09CE' -> narrowState -- Bengali khanda ta
| c <= '\x09D7' -> combiningState -- Combining marks
| c == '\x09E2' -> combiningState -- Bengali vocalic vowel signs
| c == '\x09E3' -> combiningState -- Bengali vocalic vowel signs
| c <= '\x09FD' -> narrowState -- Bengali digits and other symbols
| otherwise -> combiningState -- Bengali sandhi mark, plus a few symbols from Gurmukhi
-- Telugu (plus one character of Kannada)
| c >= '\x0C00' && c <= '\x0C80' =
if | c <= '\x0C04' -> combiningState -- Combining characters
| c <= '\x0C39' -> narrowState -- Main Telugu abugida
| c == '\x0C3D' -> narrowState -- Telugu avagraha
| c <= '\x0C56' -> combiningState -- Vowel markers
| c == '\x0C62' -> combiningState -- Combining character
| c == '\x0C63' -> combiningState -- Combining character
| otherwise -> narrowState -- Telugu digits
-- Tamil
| c >= '\x0B80' && c <= '\x0BFF' =
if | c <= '\x0B82' -> combiningState -- Combining characters
| c <= '\x0BB9' -> narrowState -- Main Tamil abugida
| c <= '\x0BCD' -> combiningState -- Vowel markers
| c == '\x0BD7' -> combiningState -- Combining character
| otherwise -> narrowState -- Tamil digits and others
where
narrowState = MatchState False (tot + 1) True 0 Nothing
wideState = MatchState False (tot + 2) False 0 Nothing
combiningState = let w = if first then 1 else 0 in MatchState False (tot + w) False 0 Nothing
controlState = MatchState False tot False 0 Nothing
ambiguousState = MatchState False (tot + 2) False 0 Nothing
updateMatchStateWide s c = updateMatchStateNoShortcut 2 s c

-- | Update a 'MatchState' by processing a character, without taking any
-- shortcuts. This should give the same answer as 'updateMatchState', but will
-- be slower. It is here to test that the shortcuts are implemented correctly.
updateMatchStateNoShortcut :: (UnicodeWidth -> Int) -> MatchState -> Char -> MatchState
updateMatchStateNoShortcut resolve (MatchState first tot lastNarrow _ Nothing) !c
updateMatchStateNoShortcut :: Int -> MatchState -> Char -> MatchState
updateMatchStateNoShortcut !ambiguous (MatchState first tot lastNarrow _ Nothing) !c
| isEmojiVariation c = MatchState False (if lastNarrow then tot + 1 else tot) False 0 Nothing
| otherwise = case IM.lookupLE oc unicodeWidthMap of
-- If there is a specific match, record the tentative width, the map of
-- continuations, and move to the next character
Just (!oc', SpecificMatch r w m) | oc == oc' ->
let r' = fromMaybe r w
in MatchState False tot (r' == Narrow) (resolve r') (Just m)
in MatchState False tot (isNarrow r') (resolveWidth ambiguous r') (Just m)
-- If there is only a range match, record the total width and move to
-- the next character
Just (!_, !match) ->
let r = rangeWidth match
-- If the string starts with a combining character. Since there is no
-- preceding character, we count 0 width as 1 in this one case:
r' = resolve $ if first && r == Combining then Narrow else r
in MatchState False (tot + r') (r == Narrow) 0 Nothing
r' = resolveWidth ambiguous $ if first && r == Combining then Narrow else r
in MatchState False (tot + r') (isNarrow r) 0 Nothing
-- M.lookupLE should not fail
Nothing -> MatchState False (tot + 1) False 0 Nothing
where
isNarrow x = x == Narrow || (x == Ambiguous && ambiguous == 1)
oc = ord c
updateMatchStateNoShortcut resolve (MatchState _ tot _ tent (Just !m)) !c
updateMatchStateNoShortcut !ambiguous (MatchState _ tot _ tent (Just !m)) !c
-- Variation modifiers modify the emoji up to this point, so can be
-- discarded. However, they always make it width 2, so we set the tentative
-- width to 2.
Expand All @@ -794,7 +963,9 @@ updateMatchStateNoShortcut resolve (MatchState _ tot _ tent (Just !m)) !c
Just m' -> MatchState False tot False 2 (Just m')
-- No continuations match, use the tentative width and process c without continuations
-- I guess we use shortcuts here; that's probably fine.
Nothing -> updateMatchState resolve (MatchState False (tot + tent) False 0 Nothing) c
Nothing -> if ambiguous == 1
then updateMatchStateNarrow (MatchState False (tot + tent) False 0 Nothing) c
else updateMatchStateWide (MatchState False (tot + tent) False 0 Nothing) c

-- | Keeps track of state in length calculations, determining whether we're at
-- the first character, the width so far, possibly a tentative width for this
Expand Down
4 changes: 2 additions & 2 deletions test/test.hs
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ tests =
zwjEmojis <&> \emoji -> testCase (T.unpack emoji) $ realLength emoji @?= 2

, testProperty "shortcut provides same answer for string length in a narrow context" . withMaxSuccess 1000000 $
\(x :: String) -> realLengthNarrowContext x === realLengthWith (updateMatchStateNoShortcut (resolveWidth 1)) x
\(x :: String) -> realLengthNarrowContext x === realLengthWith (updateMatchStateNoShortcut 1) x

, testProperty "shortcut provides same answer for string length in a wide context" . withMaxSuccess 1000000 $
\(x :: String) -> realLengthWideContext x === realLengthWith (updateMatchStateNoShortcut (resolveWidth 2)) x
\(x :: String) -> realLengthWideContext x === realLengthWith (updateMatchStateNoShortcut 2) x
]