88 "strconv"
99 "strings"
1010 "time"
11+ "unicode/utf8"
1112
1213 stats "github.com/segmentio/stats/v5"
1314)
@@ -27,8 +28,12 @@ func (s *serializer) Write(b []byte) (int, error) {
2728 return 0 , io .ErrClosedPipe
2829 }
2930
30- // Ensure the serialized metric payload has valid UTF-8 encoded bytes
31- b = bytes .ToValidUTF8 (b , []byte ("\uFFFD " ))
31+ // Ensure the serialized metric payload has valid UTF-8 encoded bytes.
32+ // ToValidUTF8 always returns a copy, so first make one pass with utf8.Valid
33+ // and only rewrite the payload when it actually contains invalid bytes.
34+ if ! utf8 .Valid (b ) {
35+ b = bytes .ToValidUTF8 (b , []byte ("\uFFFD " ))
36+ }
3237 if len (b ) <= s .bufferSize {
3338 return s .conn .Write (b )
3439 }
@@ -207,86 +212,105 @@ const (
207212 maxLen = 250 // guard for the StatsD UDP packet size
208213)
209214
210- // isTrim returns true if the byte is to be trimmed at the ends.
211- func isTrim (b byte ) bool { return b == '.' || b == '_' || b == '-' }
215+ var shouldTrim [256 ]bool = [256 ]bool {
216+ '.' : true ,
217+ '_' : true ,
218+ '-' : true ,
219+ }
212220
213221// appendSanitizedMetricName converts *any* string into something that StatsD / Graphite
214222// accepts without complaints.
215223func appendSanitizedMetricName (dst []byte , raw string ) []byte {
216- nameLen := 0
217- orig := len (dst )
218224 if raw == "" {
219225 if len (dst ) == 0 {
220226 return append (dst , "_unnamed_" ... )
221227 }
222228 return dst
223229 }
224- // ── 1. accent folding (creates one temporary ↴)
225- // tmp := stripUnicodeAccents([]byte(raw))
226-
227- // ── 2. run the same ASCII sanitizer, but write into dst
228- lastWasRepl := false
229- for i := 0 ; i < len (raw ); i ++ {
230- c := byte (raw [i ])
231-
232- if c < 128 && valid [c ] {
233- // ASCII valid chars
234- dst = append (dst , c )
235- nameLen ++
236- lastWasRepl = false
237- } else if c >= 0xC2 && c <= 0xC3 && i + 1 < len (raw ) {
238- // Check for 2-byte UTF-8 sequences that are common accented letters
239- c2 := byte (raw [i + 1 ])
240- if c2 >= 0x80 && c2 <= 0xBF { // Valid second byte
241- // Decode the 2-byte sequence
242- codepoint := uint16 (c & 0x1F )<< 6 | uint16 (c2 & 0x3F )
243-
244- // Map common accented characters (U+00C0-U+00FF range)
245- if codepoint >= 0xC0 && codepoint <= 0xFF {
246- mapped := accentMap [codepoint ]
247- if valid [mapped ] {
230+ orig := len (dst )
231+
232+ // Pre-grow dst once for at most min(len(raw), maxLen) additional bytes.
233+ need := len (raw )
234+ if need > maxLen {
235+ need = maxLen
236+ }
237+ if cap (dst )- len (dst ) < need {
238+ nd := make ([]byte , len (dst ), len (dst )+ need )
239+ copy (nd , dst )
240+ dst = nd
241+ }
242+
243+ n := len (raw )
244+ i := 0
245+ lastWasReplacement := false
246+
247+ // Skip leading '.', '_' and '-' bytes of raw so they are not copied into dst.
248+ for i < n {
249+ c := raw [i ]
250+ if ! shouldTrim [c ] {
251+ break
252+ }
253+ i ++
254+ }
255+
256+ for i < n && (len (dst )- orig ) < maxLen {
257+ // Batch ASCII-valid run
258+ remaining := maxLen - (len (dst ) - orig )
259+ j := i
260+ limit := i + remaining
261+ if limit > n {
262+ limit = n
263+ }
264+ for j < limit {
265+ c := raw [j ]
266+ if c >= 128 || ! valid [c ] {
267+ break
268+ }
269+ j ++
270+ }
271+ if j > i {
272+ dst = append (dst , raw [i :j ]... )
273+ lastWasReplacement = false
274+ i = j
275+ continue
276+ }
277+
278+ // 2-byte common accent folding
279+ c0 := raw [i ]
280+ if c0 >= 0xC2 && c0 <= 0xC3 && i + 1 < n {
281+ c1 := raw [i + 1 ]
282+ if c1 >= 0x80 && c1 <= 0xBF {
283+ code := uint16 (c0 & 0x1F )<< 6 | uint16 (c1 & 0x3F )
284+ if code >= 0xC0 && code <= 0xFF {
285+ mapped := accentMap [code ]
286+ if valid [mapped ] && (len (dst )- orig ) < maxLen {
248287 dst = append (dst , mapped )
249- nameLen ++
250- lastWasRepl = false
251- i ++ // Skip the second byte
288+ lastWasReplacement = false
289+ i += 2
252290 continue
253291 }
254292 }
255293 }
256- // If we get here, treat as invalid
257- if ! lastWasRepl {
258- dst = append (dst , replacement )
259- nameLen ++
260- lastWasRepl = true
261- }
262- } else if ! lastWasRepl {
263- // Everything else (3-byte, 4-byte sequences, invalid chars)
264- dst = append (dst , replacement )
265- nameLen ++
266- lastWasRepl = true
267294 }
268295
269- if nameLen >= maxLen {
270- break
296+ // Replacement for everything else
297+ if ! lastWasReplacement && len (dst ) > orig && (len (dst )- orig ) < maxLen {
298+ dst = append (dst , replacement )
299+ lastWasReplacement = true
271300 }
301+ i ++
272302 }
273303
274- // 3. trim leading / trailing '.', '_' or '-'
275- start , end := orig , len (dst )
276- for start < end && isTrim (dst [start ]) {
277- start ++
278- }
279- for end > start && isTrim (dst [end - 1 ]) {
280- end --
281- }
282-
283- // 4. compact if we trimmed something
284- if start > orig || end < len (dst ) {
285- copy (dst [orig :], dst [start :end ])
286- dst = dst [:orig + (end - start )]
304+ // Trim trailing '.' '_' '-'
305+ for l := len (dst ); l > orig ; {
306+ c := dst [l - 1 ]
307+ if ! shouldTrim [c ] {
308+ break
309+ }
310+ l --
311+ dst = dst [:l ]
287312 }
288313
289- // 5. fallback if everything vanished
290314 if len (dst ) == orig {
291315 return append (dst , "_truncated_" ... )
292316 }
0 commit comments