// Command realloc inspects the *.jsonl files of a directory, prints line-length
// and JSON-schema statistics for one of them, and can rewrite that file so that
// every record is space-padded to a new fixed allocation size.
package main

import (
	"bufio"
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"crypto/sha256"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
)

// fileStats holds everything measured about one JSONL file.
//
// The first group of fields describes raw lines (lengths are in bytes). The
// second group describes the JSON objects found in the records; it is filled
// either directly from plain-JSON lines or after decrypting encoded lines.
type fileStats struct {
	Path           string
	LineCount      int
	NonEmptyCount  int
	Lengths        map[int]int
	MinLen         int
	MaxLen         int
	MaxDataLen     int
	ProbablyBase64 bool

	JSONObjects        int
	JSONParseErrors    int
	MinFieldCount      int
	MaxFieldCount      int
	AllKeysSame        bool
	AllFieldCountsSame bool
	SigOverflow        bool
	SigCounts          map[string]int
	SigExampleKeys     map[string][]string
}

// ANSI escape sequences used to colorize terminal output.
const (
	colorReset  = "\x1b[0m"
	colorRed    = "\x1b[31m"
	colorGreen  = "\x1b[32m"
	colorYellow = "\x1b[33m"
	colorCyan   = "\x1b[36m"
	colorBold   = "\x1b[1m"
)

// maxTrackedSchemas caps how many distinct key signatures are recorded in
// fileStats.SigCounts; further signatures only set fileStats.SigOverflow.
const maxTrackedSchemas = 10

// main expects the directory to scan as the first command-line argument.
func main() {
	if len(os.Args) < 2 {
		fmt.Printf("%sUsage:%s realloc <dir>\n", colorCyan, colorReset)
		os.Exit(1)
	}
	if err := run(os.Args[1]); err != nil {
		fmt.Fprintf(os.Stderr, "%sError:%s %v\n", colorRed, colorReset, err)
		os.Exit(1)
	}
}

// run drives the interactive session: it lists the *.jsonl files in dir, lets
// the user pick one, prints its statistics and optionally re-pads it to a new
// allocSize. Cancelling at any prompt returns nil.
func run(dir string) error {
	info, err := os.Stat(dir)
	if err != nil {
		return fmt.Errorf("stat dir: %w", err)
	}
	if !info.IsDir() {
		return fmt.Errorf("%s is not a directory", dir)
	}

	files, err := filepath.Glob(filepath.Join(dir, "*.jsonl"))
	if err != nil {
		return fmt.Errorf("glob: %w", err)
	}
	if len(files) == 0 {
		fmt.Printf("%sNo *.jsonl files found in%s %s\n", colorYellow, colorReset, dir)
		return nil
	}

	fmt.Printf("%sFound JSONL files:%s\n", colorCyan, colorReset)
	for i, f := range files {
		fmt.Printf(" [%d] %s\n", i+1, filepath.Base(f))
	}

	reader := bufio.NewReader(os.Stdin)
	fmt.Printf("\n%sEnter file index or name (empty to cancel): %s", colorCyan, colorReset)
	input := readTrimmedLine(reader)
	if input == "" {
		fmt.Printf("%sCancelled.%s\n", colorYellow, colorReset)
		return nil
	}
	path := selectFile(files, input)
	if path == "" {
		return fmt.Errorf("file %q not found in %s", input, dir)
	}

	stats, err := analyzeFile(path)
	if err != nil {
		return err
	}

	// Plain JSON files already carry field statistics from analyzeFile. For a
	// file that looks encoded, ask for the key and try to decrypt the records
	// so the same statistics can be shown.
	if stats.ProbablyBase64 {
		fmt.Printf("\n%sFile looks encoded. If it was encrypted with Encode/EncodeKey, you can enter the key to analyze JSON fields.%s\n", colorYellow, colorReset)
		fmt.Printf("%sEncode key (press Enter to skip): %s", colorCyan, colorReset)
		if keyLine := readTrimmedLine(reader); keyLine != "" {
			// A failure here only costs the optional field statistics, so it
			// is reported and the session continues.
			if err := fillDecryptedJSONStats(path, keyLine, stats); err != nil {
				fmt.Fprintf(os.Stderr, "%sDecrypt/JSON analyze error:%s %v\n", colorRed, colorReset, err)
			}
		}
	}

	printStats(stats)

	// Suggest an allocSize: the longest line plus one byte for the newline.
	recommendedAlloc := stats.MaxLen + 1
	fmt.Printf("\n%sSuggested allocSize%s (from max line length %d): %s%d%s\n", colorCyan, colorReset, stats.MaxLen, colorGreen, recommendedAlloc, colorReset)
	fmt.Printf("%sEnter new allocSize%s (empty to skip, 0 to use suggested): ", colorCyan, colorReset)
	line := readTrimmedLine(reader)
	if line == "" {
		fmt.Printf("%sSkipped realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	targetAlloc, err := parseAllocSize(line, recommendedAlloc)
	if err != nil {
		return err
	}

	if err := reallocFile(path, stats, targetAlloc); err != nil {
		return err
	}
	fmt.Printf("%sRealloc completed.%s File updated: %s\n", colorGreen, colorReset, path)
	return nil
}

// readTrimmedLine reads one line from r and returns it without surrounding
// whitespace.
func readTrimmedLine(r *bufio.Reader) string {
	// The read error is deliberately ignored: on EOF or a failed read whatever
	// was received is used, and an empty answer means "cancel/skip" at every
	// prompt, which is the safe default.
	line, _ := r.ReadString('\n')
	return strings.TrimSpace(line)
}

// selectFile resolves the user's answer to one of files. A number in
// [1, len(files)] is treated as a 1-based index; anything else is matched
// against the base names. It returns "" when nothing matches.
func selectFile(files []string, input string) string {
	if idx, err := strconv.Atoi(input); err == nil && idx >= 1 && idx <= len(files) {
		return files[idx-1]
	}
	for _, f := range files {
		if filepath.Base(f) == input {
			return f
		}
	}
	return ""
}

// parseAllocSize interprets the allocSize prompt answer. "0" selects the
// suggested value; any other answer must be an integer of at least 2 (one data
// byte plus the newline).
func parseAllocSize(line string, suggested int) (int, error) {
	if line == "0" {
		return suggested, nil
	}
	val, err := strconv.Atoi(line)
	if err != nil || val < 2 {
		return 0, fmt.Errorf("invalid allocSize: %q", line)
	}
	return val, nil
}

// splitLines splits file contents into lines. A single trailing newline does
// not produce an extra empty line, and empty contents yield no lines at all.
func splitLines(data string) []string {
	if data == "" {
		return nil
	}
	return strings.Split(strings.TrimSuffix(data, "\n"), "\n")
}

// looksLikeJSONObject reports whether core is wrapped in { ... }.
func looksLikeJSONObject(core string) bool {
	return strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")
}

// analyzeFile reads path and returns its line statistics. For files that look
// like plain JSON the JSON field statistics are filled as well.
func analyzeFile(path string) (*fileStats, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read file: %w", err)
	}
	return analyzeLines(path, splitLines(string(data))), nil
}

// analyzeLines computes the statistics of analyzeFile from already split
// lines; path is only recorded in the result.
func analyzeLines(path string, lines []string) *fileStats {
	stats := &fileStats{
		Path:           path,
		LineCount:      len(lines),
		Lengths:        make(map[int]int),
		SigCounts:      make(map[string]int),
		SigExampleKeys: make(map[string][]string),
	}

	dataLines := 0         // lines that carry something besides whitespace
	encodedCandidates := 0 // data lines that are not wrapped in { ... }
	for _, l := range lines {
		if l == "" {
			continue
		}
		stats.NonEmptyCount++

		n := len(l)
		stats.Lengths[n]++
		if stats.NonEmptyCount == 1 || n < stats.MinLen {
			stats.MinLen = n
		}
		if n > stats.MaxLen {
			stats.MaxLen = n
		}
		if dataLen := len(strings.TrimRight(l, " ")); dataLen > stats.MaxDataLen {
			stats.MaxDataLen = dataLen
		}

		core := strings.TrimSpace(l)
		if core == "" {
			continue
		}
		dataLines++
		if !looksLikeJSONObject(core) {
			encodedCandidates++
		}
	}
	if stats.NonEmptyCount == 0 {
		return stats
	}

	// Heuristic: the file is treated as encoded when at least half of the
	// lines that carry data are not wrapped in { ... }. Whitespace-only lines
	// are left out of the ratio, and at least one candidate is required so a
	// file without any such line is never classified as encoded.
	stats.ProbablyBase64 = encodedCandidates > 0 && encodedCandidates*2 >= dataLines

	// Field statistics are only meaningful for lines that are readable JSON.
	if !stats.ProbablyBase64 {
		collectJSONStats(stats, lines, nil)
	}
	return stats
}

// encodeKeyBytes turns the key string into 32 bytes (SHA-256) for AES-256.
func encodeKeyBytes(keyStr string) []byte {
	h := sha256.Sum256([]byte(keyStr))
	return h[:]
}

// aesGCMDecrypt decrypts AES-GCM data laid out as nonce||ciphertext. It
// returns an error when the input is shorter than the nonce or when
// authentication fails.
func aesGCMDecrypt(ciphertext, key []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}
	aesgcm, err := cipher.NewGCM(block)
	if err != nil {
		return nil, err
	}
	nonceSize := aesgcm.NonceSize()
	if len(ciphertext) < nonceSize {
		return nil, fmt.Errorf("ciphertext too short")
	}
	nonce, data := ciphertext[:nonceSize], ciphertext[nonceSize:]
	return aesgcm.Open(nil, nonce, data, nil)
}

// decryptLine decodes one record stored as base64(AES-GCM(JSON)).
func decryptLine(raw string, key []byte) (string, error) {
	enc, err := base64.StdEncoding.DecodeString(raw)
	if err != nil {
		return "", fmt.Errorf("base64 decode: %w", err)
	}
	plain, err := aesGCMDecrypt(enc, key)
	if err != nil {
		return "", fmt.Errorf("decrypt: %w", err)
	}
	return string(plain), nil
}

// jsonCollector accumulates per-object JSON statistics into stats.
type jsonCollector struct {
	stats    *fileStats
	haveRef  bool   // true once the first object has been recorded
	refSig   string // key signature of the first object
	refCount int    // field count of the first object
}

// add parses core as a JSON object and folds it into the statistics. Anything
// that is not a valid JSON object counts as a parse error.
func (c *jsonCollector) add(core string) {
	s := c.stats
	if !looksLikeJSONObject(core) {
		s.JSONParseErrors++
		return
	}
	var obj map[string]any
	if err := json.Unmarshal([]byte(core), &obj); err != nil {
		s.JSONParseErrors++
		return
	}
	s.JSONObjects++

	cnt := len(obj)
	keys := make([]string, 0, cnt)
	for k := range obj {
		keys = append(keys, k)
	}
	sortStrings(keys)
	sig := strings.Join(keys, "\x1f") // case-sensitive key signature

	// An explicit flag marks the first object: an empty object has field
	// count 0 and signature "", so neither value can double as "not set yet".
	if !c.haveRef {
		c.haveRef = true
		c.refSig, c.refCount = sig, cnt
		s.MinFieldCount, s.MaxFieldCount = cnt, cnt
	} else {
		if cnt < s.MinFieldCount {
			s.MinFieldCount = cnt
		}
		if cnt > s.MaxFieldCount {
			s.MaxFieldCount = cnt
		}
		if sig != c.refSig {
			s.AllKeysSame = false
		}
		if cnt != c.refCount {
			s.AllFieldCountsSame = false
		}
	}

	// Track a bounded number of distinct signatures.
	switch _, seen := s.SigCounts[sig]; {
	case seen:
		s.SigCounts[sig]++
	case len(s.SigCounts) < maxTrackedSchemas:
		s.SigCounts[sig] = 1
		s.SigExampleKeys[sig] = keys
	default:
		s.SigOverflow = true
	}
}

// collectJSONStats resets the JSON part of stats and recomputes it from lines.
// With a nil key the lines are read as plain JSON; otherwise every line is
// decoded as base64(AES-GCM(JSON)) first. Lines that fail to decode, decrypt
// or parse are counted in JSONParseErrors; blank lines are skipped.
func collectJSONStats(stats *fileStats, lines []string, key []byte) {
	stats.JSONObjects = 0
	stats.JSONParseErrors = 0
	stats.MinFieldCount = 0
	stats.MaxFieldCount = 0
	stats.AllKeysSame = true
	stats.AllFieldCountsSame = true
	stats.SigOverflow = false
	stats.SigCounts = make(map[string]int)
	stats.SigExampleKeys = make(map[string][]string)

	c := jsonCollector{stats: stats}
	for _, l := range lines {
		raw := strings.TrimSpace(l)
		if raw == "" {
			continue
		}
		payload := raw
		if key != nil {
			plain, err := decryptLine(raw, key)
			if err != nil {
				stats.JSONParseErrors++
				continue
			}
			payload = strings.TrimSpace(plain)
			if payload == "" {
				continue
			}
		}
		c.add(payload)
	}

	// Without any parsed object the "all equal" flags would be vacuous.
	if stats.JSONObjects == 0 {
		stats.AllKeysSame = false
		stats.AllFieldCountsSame = false
	}
}

// fillDecryptedJSONStats fills the JSON statistics of stats from the file at
// path, replacing whatever JSON statistics stats held before.
// If keyStr == "" the lines are expected to contain plain JSON.
// If keyStr != "" the lines are treated as base64(AES-GCM(JSON)).
// The only error returned is a failure to read the file.
func fillDecryptedJSONStats(path, keyStr string, stats *fileStats) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	var key []byte
	if keyStr != "" {
		key = encodeKeyBytes(keyStr)
	}
	collectJSONStats(stats, splitLines(string(data)), key)
	return nil
}

// sortedLengths returns the keys of lengths in ascending order.
func sortedLengths(lengths map[int]int) []int {
	out := make([]int, 0, len(lengths))
	for l := range lengths {
		out = append(out, l)
	}
	sort.Ints(out)
	return out
}

// schemaCount pairs a key signature with the number of records that use it.
type schemaCount struct {
	sig   string
	count int
}

// orderedSchemas lists the signatures in counts, most frequent first; ties are
// broken by signature so the order is deterministic.
func orderedSchemas(counts map[string]int) []schemaCount {
	list := make([]schemaCount, 0, len(counts))
	for sig, c := range counts {
		list = append(list, schemaCount{sig: sig, count: c})
	}
	sort.Slice(list, func(i, j int) bool {
		if list[i].count != list[j].count {
			return list[i].count > list[j].count
		}
		return list[i].sig < list[j].sig
	})
	return list
}

// printStats writes a human-readable report of s to standard output.
func printStats(s *fileStats) {
	fmt.Printf("\n%sStats for%s %s%s%s\n", colorBold, colorReset, colorCyan, s.Path, colorReset)
	fmt.Printf(" Total lines: %d\n", s.LineCount)
	fmt.Printf(" Non-empty lines: %d\n", s.NonEmptyCount)
	fmt.Printf(" Min line length: %d\n", s.MinLen)
	fmt.Printf(" Max line length: %d\n", s.MaxLen)
	fmt.Printf(" Max data length: %d (without trailing spaces)\n", s.MaxDataLen)

	// Map iteration order is random, so the lengths are sorted for stable output.
	fmt.Printf(" Unique lengths: ")
	for i, l := range sortedLengths(s.Lengths) {
		if i > 0 {
			fmt.Print(", ")
		}
		fmt.Printf("%d(%d)", l, s.Lengths[l])
	}
	fmt.Println()

	if s.ProbablyBase64 {
		fmt.Printf(" Encoded: %slikely encoded%s (no { } around data)\n", colorYellow, colorReset)
		// Field statistics exist only if decryption was attempted.
		if s.JSONObjects > 0 || s.JSONParseErrors > 0 {
			fmt.Printf(" Decoded JSON: %s%s%s\n", colorCyan, "analysis", colorReset)
			printJSONStats(s)
		}
		return
	}
	fmt.Printf(" Encoded: %slooks like plain JSON%s\n", colorGreen, colorReset)
	printJSONStats(s)
}

// printJSONStats prints the JSON-object part of the report: object and error
// counts, field-count and field-name consistency, and example schemas.
func printJSONStats(s *fileStats) {
	fmt.Printf(" JSON objects: %d\n", s.JSONObjects)
	if s.JSONParseErrors > 0 {
		fmt.Printf(" JSON errors: %s%d%s\n", colorYellow, s.JSONParseErrors, colorReset)
	} else {
		fmt.Printf(" JSON errors: %s0%s\n", colorGreen, colorReset)
	}
	if s.JSONObjects == 0 {
		return
	}

	fmt.Printf(" Fields per rec: min=%d max=%d\n", s.MinFieldCount, s.MaxFieldCount)
	if s.AllFieldCountsSame {
		fmt.Printf(" Field counts: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field counts: %sNOT equal%s\n", colorRed, colorReset)
	}
	if s.AllKeysSame {
		fmt.Printf(" Field names: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field names: %sNOT equal%s\n", colorRed, colorReset)
	}

	schemas := orderedSchemas(s.SigCounts)
	if len(schemas) == 0 {
		return
	}
	extra := ""
	if s.SigOverflow {
		extra = " (showing first 10)"
	}
	color := colorGreen
	if len(schemas) > 1 {
		color = colorYellow
	}
	fmt.Printf(" Schemas: %s%d%s%s\n", color, len(schemas), colorReset, extra)

	// The most frequent schema is the reference; the runner-up is shown as an
	// example of a differing schema.
	ref := schemas[0]
	fmt.Printf(" Ref schema keys (%s%d%s recs): %v\n", colorCyan, ref.count, colorReset, s.SigExampleKeys[ref.sig])
	if len(schemas) > 1 {
		diff := schemas[1]
		fmt.Printf(" Diff schema keys (%s%d%s recs): %v\n", colorYellow, diff.count, colorReset, s.SigExampleKeys[diff.sig])
	}
}

// sortStrings sorts a in place in ascending order.
func sortStrings(a []string) {
	sort.Strings(a)
}

// padLines rebuilds the file body so that every record occupies exactly
// targetLen bytes followed by a newline. Trailing spaces are stripped before
// padding; empty and space-only lines are treated as deleted records and
// dropped. It fails if a record's data is longer than targetLen.
func padLines(lines []string, targetLen int) ([]byte, error) {
	var buf bytes.Buffer
	for _, l := range lines {
		trimmed := strings.TrimRight(l, " ")
		if trimmed == "" {
			continue
		}
		if len(trimmed) > targetLen {
			return nil, fmt.Errorf("line data length %d exceeds targetLen %d", len(trimmed), targetLen)
		}
		buf.WriteString(trimmed)
		buf.WriteString(strings.Repeat(" ", targetLen-len(trimmed)))
		buf.WriteByte('\n')
	}
	return buf.Bytes(), nil
}

// reallocFile rewrites the file at path so that every record is padded to
// newAlloc-1 bytes plus a newline. The previous contents are saved to
// path+".bak" before the file is overwritten. stats must describe the same
// file; it is used to reject sizes that would truncate data before anything
// is written. A file without non-empty lines is left untouched.
func reallocFile(path string, stats *fileStats, newAlloc int) error {
	if stats.NonEmptyCount == 0 {
		fmt.Printf("%sFile is empty, nothing to realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	targetLen := newAlloc - 1
	if targetLen < 1 {
		return fmt.Errorf("target allocSize too small")
	}
	if stats.MaxDataLen > targetLen {
		return fmt.Errorf("cannot set allocSize=%d: max data length %d would be truncated", newAlloc, stats.MaxDataLen)
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	out, err := padLines(splitLines(string(data)), targetLen)
	if err != nil {
		return err
	}

	// Keep the original permission bits for the backup and the rewritten
	// file; fall back to 0644 if they cannot be determined.
	perm := os.FileMode(0644)
	if info, err := os.Stat(path); err == nil {
		perm = info.Mode().Perm()
	}

	backup := path + ".bak"
	if err := os.WriteFile(backup, data, perm); err != nil {
		return fmt.Errorf("write backup: %w", err)
	}
	if err := os.WriteFile(path, out, perm); err != nil {
		return fmt.Errorf("write new file: %w", err)
	}
	fmt.Printf("%sBackup saved%s to %s\n", colorGreen, colorReset, backup)
	return nil
}