@@ -3,7 +3,7 @@ mod codegen;
 use pg_query::protobuf::{KeywordKind, ScanToken};
 use regex::Regex;
 use std::{collections::VecDeque, sync::LazyLock};
-use text_size::{TextRange, TextSize};
+use text_size::{TextLen, TextRange, TextSize};
 
 pub use crate::codegen::SyntaxKind;
 
@@ -119,25 +119,24 @@ pub fn lex(text: &str) -> Vec<Token> {
 
     // merge the two token lists
     let mut tokens: Vec<Token> = Vec::new();
-    let mut pos = 0;
+    let mut pos = TextSize::from(0);
 
-    while pos < text.len() {
-        if !pg_query_tokens.is_empty() && pg_query_tokens[0].start == i32::try_from(pos).unwrap() {
+    while pos < text.text_len() {
+        if !pg_query_tokens.is_empty()
+            && TextSize::from(u32::try_from(pg_query_tokens[0].start).unwrap()) == pos
+        {
             let pg_query_token = pg_query_tokens.pop_front().unwrap();
-            let token_text: String = text
-                .chars()
-                .skip(usize::try_from(pg_query_token.start).unwrap())
-                .take(
-                    usize::try_from(pg_query_token.end).unwrap()
-                        - usize::try_from(pg_query_token.start).unwrap(),
-                )
-                .collect();
-            let len = token_text.len();
+
+            // the lexer returns byte indices, so we need to slice
+            let token_text = &text[usize::try_from(pg_query_token.start).unwrap()
+                ..usize::try_from(pg_query_token.end).unwrap()];
+
+            let len = token_text.text_len();
             let has_whitespace = token_text.contains(" ") || token_text.contains("\n");
             tokens.push(Token {
                 token_type: TokenType::from(&pg_query_token),
                 kind: SyntaxKind::from(&pg_query_token),
-                text: token_text,
+                text: token_text.to_string(),
                 span: TextRange::new(
                     TextSize::from(u32::try_from(pg_query_token.start).unwrap()),
                     TextSize::from(u32::try_from(pg_query_token.end).unwrap()),
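
Note on this hunk: pg_query's scan tokens carry byte offsets, so the token text is now taken with a byte slice instead of chars().skip(...).take(...), which counts Unicode scalar values rather than bytes. The following is a minimal standalone sketch of how the two indexing schemes diverge once a multi-byte character appears earlier in the input; it is not part of this commit, and the example string and offsets are illustrative only.

fn main() {
    // 'Å' is two bytes in UTF-8, so byte offsets and char positions diverge
    // for everything that follows it.
    let text = "'Å' = name";

    // Byte offsets of the identifier `name`, as a byte-oriented scanner reports them.
    let start = text.find("name").unwrap(); // 7
    let end = start + "name".len(); // 11

    // Byte slicing, as in the patched lexer, recovers the token text.
    assert_eq!(&text[start..end], "name");

    // Treating the same numbers as char positions skips one character too many.
    let by_chars: String = text.chars().skip(start).take(end - start).collect();
    assert_eq!(by_chars, "ame");
}
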
@@ -147,8 +146,7 @@ pub fn lex(text: &str) -> Vec<Token> {
 
             if has_whitespace {
                 while !whitespace_tokens.is_empty()
-                    && whitespace_tokens[0].span.start()
-                        < TextSize::from(u32::try_from(pos).unwrap())
+                    && whitespace_tokens[0].span.start() < TextSize::from(u32::from(pos))
                 {
                     whitespace_tokens.pop_front();
                 }
@@ -158,16 +156,21 @@ pub fn lex(text: &str) -> Vec<Token> {
         }
 
         if !whitespace_tokens.is_empty()
-            && whitespace_tokens[0].span.start() == TextSize::from(u32::try_from(pos).unwrap())
+            && whitespace_tokens[0].span.start() == TextSize::from(u32::from(pos))
         {
             let whitespace_token = whitespace_tokens.pop_front().unwrap();
-            let len = whitespace_token.text.len();
+            let len = whitespace_token.text.text_len();
             tokens.push(whitespace_token);
             pos += len;
             continue;
         }
 
-        panic!("No token found at position {}", pos);
+        let usize_pos = usize::from(pos);
+        panic!(
+            "No token found at position {:?}: '{:?}'",
+            pos,
+            text.get(usize_pos..usize_pos + 1)
+        );
     }
 
     tokens
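
Note on this hunk: with the cursor held as a TextSize, lengths produced by text_len() add onto it directly, and any conversion back to a plain integer is explicit. Below is a small standalone sketch of that arithmetic with the text-size crate; it is illustrative only, not code from this commit.

use text_size::{TextLen, TextRange, TextSize};

fn main() {
    let text = "select 1";
    let mut pos = TextSize::from(0);

    // text_len() returns a TextSize, so it adds onto the cursor without casts.
    let token = "select";
    let span = TextRange::new(pos, pos + token.text_len());
    pos += token.text_len();

    // Conversions back to integer types are explicit, as in the patch.
    assert_eq!(u32::from(span.end()), 6);
    assert_eq!(usize::from(pos), 6);
    assert_eq!(&text[usize::from(span.start())..usize::from(span.end())], token);
}
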
@@ -177,6 +180,13 @@ pub fn lex(text: &str) -> Vec<Token> {
 mod tests {
     use super::*;
 
+    #[test]
+    fn test_special_chars() {
+        let input = "insert into c (name, full_name) values ('Å', 1);";
+        let tokens = lex(input);
+        assert!(!tokens.is_empty());
+    }
+
     #[test]
     fn test_tab_tokens() {
         let input = "select\t1";