@@ -19,7 +19,9 @@ type t = {
1919 Diagnostics .category ->
2020 unit ;
2121 mutable ch : charEncoding ; (* current character *)
22- mutable offset : int ; (* character offset *)
22+ mutable offset : int ; (* current byte offset *)
23+ mutable offset16 : int ;
24+ (* current number of utf16 code units since line start *)
2325 mutable lineOffset : int ; (* current line offset *)
2426 mutable lnum : int ; (* current line number *)
2527 mutable mode : mode list ;
@@ -51,12 +53,11 @@ let position scanner =
5153 (* line number *)
5254 pos_lnum = scanner.lnum;
5355 (* offset of the beginning of the line (number
54- of characters between the beginning of the scanner and the beginning
56+ of bytes between the beginning of the scanner and the beginning
5557 of the line) *)
5658 pos_bol = scanner.lineOffset;
57- (* [pos_cnum] is the offset of the position (number of
58- characters between the beginning of the scanner and the position). *)
59- pos_cnum = scanner.offset;
59+ (* [pos_cnum - pos_bol] is the number of utf16 code units since line start *)
60+ pos_cnum = scanner.lineOffset + scanner.offset16;
6061 }
6162
6263(* Small debugging util
@@ -95,19 +96,29 @@ let _printDebug ~startPos ~endPos scanner token =
9596
(** [next scanner] advances [scanner] by one byte of [scanner.src].

    Before moving, the byte currently held in [scanner.ch] is accounted for:
    - on ['\n'], [lineOffset] is set to the byte offset just past the
      newline, [offset16] is reset to [0] and [lnum] is incremented;
    - otherwise [offset16] grows by the number of UTF-16 code units
      attributed to that byte.

    When the end of [scanner.src] is reached, [offset] is clamped to the
    source length and [ch] becomes [hackyEOFChar]. *)
let next scanner =
  let nextOffset = scanner.offset + 1 in
  (* UTF-16 code units contributed by the byte we are leaving behind.
     The whole cost of a multi-byte sequence is charged to its leading
     byte; continuation bytes contribute nothing.
     NOTE(review): presumably [n] in [Leading (n, _)] is the number of
     continuation bytes that follow, so that [(n + 1) / 2] yields 1 for
     2- and 3-byte sequences and 2 for 4-byte ones (a surrogate pair) —
     confirm against [Ext_utf8.classify]. *)
  let utf16len =
    match Ext_utf8.classify scanner.ch with
    | Single _ | Invalid -> 1
    | Leading (n, _) -> (((n + 1) / 2) [@doesNotRaise])
    | Cont _ -> 0
  in
  let newline =
    scanner.ch = '\n'
    (* What about CRLF (\r + \n) on windows?
       \r\n will always be terminated by a \n
       -> we can just bump the line count on \n *)
  in
  if newline then (
    (* The next line starts at the byte right after the newline. *)
    scanner.lineOffset <- nextOffset;
    scanner.offset16 <- 0;
    scanner.lnum <- scanner.lnum + 1)
  else scanner.offset16 <- scanner.offset16 + utf16len;
  if nextOffset < String.length scanner.src then (
    scanner.offset <- nextOffset;
    scanner.ch <- String.unsafe_get scanner.src nextOffset)
  else (
    scanner.offset <- String.length scanner.src;
    (* NOTE(review): at end of input [offset16] is overwritten with the
       BYTE distance from the line start, discarding the UTF-16 count
       accumulated above. The two differ when the last line contains
       multi-byte characters — confirm this is intended. *)
    scanner.offset16 <- scanner.offset - scanner.lineOffset;
    scanner.ch <- hackyEOFChar)
112123
113124let next2 scanner =
@@ -141,6 +152,7 @@ let make ~filename src =
141152 err = (fun ~startPos :_ ~endPos :_ _ -> () );
142153 ch = (if src = " " then hackyEOFChar else String. unsafe_get src 0 );
143154 offset = 0 ;
155+ offset16 = 0 ;
144156 lineOffset = 0 ;
145157 lnum = 1 ;
146158 mode = [] ;
@@ -847,6 +859,7 @@ let rec scan scanner =
847859 | ch , _ ->
848860 next scanner;
849861 let offset = scanner.offset in
862+ let offset16 = scanner.offset16 in
850863 let codepoint, length =
851864 Res_utf8. decodeCodePoint scanner.offset scanner.src
852865 (String. length scanner.src)
@@ -863,6 +876,7 @@ let rec scan scanner =
863876 else (
864877 scanner.ch < - ch;
865878 scanner.offset < - offset;
879+ scanner.offset16 < - offset16;
866880 SingleQuote ))
867881 | '!' -> (
868882 match (peek scanner, peek2 scanner) with
0 commit comments