before merge to main

This commit is contained in:
2026-03-04 16:29:24 +06:00
parent 0481cde1c3
commit 491ccbea89
21 changed files with 6776 additions and 47 deletions

639
tools/realloc/main.go Normal file
View File

@@ -0,0 +1,639 @@
package main
import (
	"bufio"
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"crypto/sha256"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
)
// fileStats collects line-level and JSON-level statistics for a single
// *.jsonl file, filled by analyzeFile and fillDecryptedJSONStats.
type fileStats struct {
	Path           string      // path of the analyzed file
	LineCount      int         // raw line count from splitting on '\n'
	NonEmptyCount  int         // lines containing at least one character
	Lengths        map[int]int // line length -> number of lines with that length
	MinLen         int         // shortest non-empty line (0 if none)
	MaxLen         int         // longest line
	MaxDataLen     int         // longest line after stripping trailing padding spaces
	ProbablyBase64 bool        // true when at least half the lines are not wrapped in { }
	// JSON-object stats (filled for plain JSON, or for encoded files after
	// a successful decryption pass in fillDecryptedJSONStats)
	JSONObjects        int                 // lines that parsed into a JSON object
	JSONParseErrors    int                 // lines that failed base64/decrypt/JSON parsing
	MinFieldCount      int                 // fewest fields seen in a parsed object
	MaxFieldCount      int                 // most fields seen in a parsed object
	AllKeysSame        bool                // every parsed object has an identical key set
	AllFieldCountsSame bool                // every parsed object has the same number of fields
	SigOverflow        bool                // more than 10 distinct key-set signatures were seen
	SigCounts          map[string]int      // key-set signature -> record count (capped at 10 signatures)
	SigExampleKeys     map[string][]string // signature -> example sorted key list
}
// ANSI escape sequences used to colorize terminal output.
const (
	colorReset  = "\x1b[0m"
	colorRed    = "\x1b[31m"
	colorGreen  = "\x1b[32m"
	colorYellow = "\x1b[33m"
	colorCyan   = "\x1b[36m"
	colorBold   = "\x1b[1m"
)
// main validates the command-line arguments and delegates all work to
// run, exiting with status 1 on a usage error or a processing failure.
func main() {
	args := os.Args
	if len(args) < 2 {
		fmt.Printf("%sUsage:%s realloc <directory-with-jsonl-files>\n", colorCyan, colorReset)
		os.Exit(1)
	}
	err := run(args[1])
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "%sError:%s %v\n", colorRed, colorReset, err)
	os.Exit(1)
}
// run drives the interactive workflow: list the *.jsonl files in dir,
// let the user pick one, print its statistics, and optionally rewrite it
// with a new fixed record allocation size (padding each line to the same
// width). Returns an error for invalid input or I/O failures; a user
// cancellation (empty input) is not an error.
func run(dir string) error {
	info, err := os.Stat(dir)
	if err != nil {
		return fmt.Errorf("stat dir: %w", err)
	}
	if !info.IsDir() {
		return fmt.Errorf("%s is not a directory", dir)
	}
	files, err := filepath.Glob(filepath.Join(dir, "*.jsonl"))
	if err != nil {
		return fmt.Errorf("glob: %w", err)
	}
	if len(files) == 0 {
		fmt.Printf("%sNo *.jsonl files found in%s %s\n", colorYellow, colorReset, dir)
		return nil
	}
	fmt.Printf("%sFound JSONL files:%s\n", colorCyan, colorReset)
	for i, f := range files {
		fmt.Printf(" [%d] %s\n", i+1, filepath.Base(f))
	}
	reader := bufio.NewReader(os.Stdin)
	fmt.Printf("\n%sEnter file index or name (empty to cancel): %s", colorCyan, colorReset)
	// Read errors are deliberately ignored: an unreadable/empty line
	// simply cancels the operation below.
	input, _ := reader.ReadString('\n')
	input = strings.TrimSpace(input)
	if input == "" {
		fmt.Printf("%sCancelled.%s\n", colorYellow, colorReset)
		return nil
	}
	// The selection may be a 1-based list index or an exact base name.
	var path string
	if idx, err := strconv.Atoi(input); err == nil && idx >= 1 && idx <= len(files) {
		path = files[idx-1]
	} else {
		// try match by name
		for _, f := range files {
			if filepath.Base(f) == input {
				path = f
				break
			}
		}
		if path == "" {
			return fmt.Errorf("file %q not found in %s", input, dir)
		}
	}
	stats, err := analyzeFile(path)
	if err != nil {
		return err
	}
	// If the file looks encoded, ask for the key and try to decrypt so we
	// can also compute per-field JSON statistics.
	if stats.ProbablyBase64 {
		fmt.Printf("\n%sFile looks encoded. If it was encrypted with Encode/EncodeKey, you can enter the key to analyze JSON fields.%s\n", colorYellow, colorReset)
		fmt.Printf("%sEncode key (press Enter to skip): %s", colorCyan, colorReset)
		keyLine, _ := reader.ReadString('\n')
		keyLine = strings.TrimSpace(keyLine)
		if keyLine != "" {
			// A failed decrypt/parse is reported but does not abort the
			// run; the basic line statistics are still printed.
			if err := fillDecryptedJSONStats(path, keyLine, stats); err != nil {
				fmt.Fprintf(os.Stderr, "%sDecrypt/JSON analyze error:%s %v\n", colorRed, colorReset, err)
			}
		}
	} else {
		// Plain JSON -- compute the field statistics right away.
		if err := fillDecryptedJSONStats(path, "", stats); err != nil {
			fmt.Fprintf(os.Stderr, "%sJSON analyze error:%s %v\n", colorRed, colorReset, err)
		}
	}
	printStats(stats)
	// Suggest allocSize: one more than the longest line, presumably leaving
	// room for the record's trailing newline.
	recommendedAlloc := stats.MaxLen + 1
	fmt.Printf("\n%sSuggested allocSize%s (from max line length %d): %s%d%s\n",
		colorCyan, colorReset, stats.MaxLen, colorGreen, recommendedAlloc, colorReset)
	fmt.Printf("%sEnter new allocSize%s (empty to skip, 0 to use suggested): ", colorCyan, colorReset)
	line, _ := reader.ReadString('\n')
	line = strings.TrimSpace(line)
	if line == "" {
		fmt.Printf("%sSkipped realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	var targetAlloc int
	if line == "0" {
		// 0 is shorthand for "use the suggested value".
		targetAlloc = recommendedAlloc
	} else {
		val, err := strconv.Atoi(line)
		if err != nil || val < 2 {
			return fmt.Errorf("invalid allocSize: %q", line)
		}
		targetAlloc = val
	}
	if err := reallocFile(path, stats, targetAlloc); err != nil {
		return err
	}
	fmt.Printf("%sRealloc completed.%s File updated: %s\n", colorGreen, colorReset, path)
	return nil
}
// analyzeFile reads path and gathers line-length statistics. When the
// content looks like plain JSON objects (rather than encoded data) it
// also collects per-object field statistics in the same pass.
func analyzeFile(path string) (*fileStats, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read file: %w", err)
	}
	lines := strings.Split(string(data), "\n")
	stats := &fileStats{
		Path: path,
		// NOTE(review): this includes the empty tail element produced by a
		// trailing newline, so LineCount can be one higher than expected.
		LineCount:      len(lines),
		Lengths:        make(map[int]int),
		MinLen:         0,
		MaxLen:         0,
		MaxDataLen:     0,
		SigCounts:      make(map[string]int),
		SigExampleKeys: make(map[string][]string),
	}
	var nonEmpty []string
	for _, l := range lines {
		if l == "" {
			continue
		}
		nonEmpty = append(nonEmpty, l)
	}
	stats.NonEmptyCount = len(nonEmpty)
	if len(nonEmpty) == 0 {
		return stats, nil
	}
	// Heuristic: if the significant data is not wrapped in { ... }, treat
	// the line as encoded content; half or more such lines flags the file.
	encodedCandidates := 0
	for _, l := range nonEmpty {
		n := len(l)
		stats.Lengths[n]++
		// MinLen uses 0 as "unset"; safe here because every l is non-empty.
		if stats.MinLen == 0 || n < stats.MinLen {
			stats.MinLen = n
		}
		if n > stats.MaxLen {
			stats.MaxLen = n
		}
		// Data length ignores right-padding spaces of the fixed layout.
		trimmed := strings.TrimRight(l, " ")
		if len(trimmed) > stats.MaxDataLen {
			stats.MaxDataLen = len(trimmed)
		}
		// Significant data without { } around it counts as "encoded".
		core := strings.TrimSpace(l)
		if core != "" && !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
			encodedCandidates++
		}
	}
	if stats.NonEmptyCount > 0 && encodedCandidates >= stats.NonEmptyCount/2 {
		stats.ProbablyBase64 = true
	}
	// Extra JSON field statistics -- only when the file does not look encoded.
	if !stats.ProbablyBase64 {
		stats.MinFieldCount = 0
		stats.MaxFieldCount = 0
		stats.AllKeysSame = true
		stats.AllFieldCountsSame = true
		var refSig string // key-set signature of the first parsed object
		var refCount int  // field count of the first parsed object
		for _, l := range nonEmpty {
			core := strings.TrimSpace(strings.TrimRight(l, " "))
			if core == "" {
				continue
			}
			// Every line is expected to hold one JSON object.
			if !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
				stats.JSONParseErrors++
				continue
			}
			var obj map[string]any
			if err := json.Unmarshal([]byte(core), &obj); err != nil {
				stats.JSONParseErrors++
				continue
			}
			stats.JSONObjects++
			cnt := len(obj)
			// NOTE(review): 0 doubles as "unset", so an empty object {}
			// cannot lower MinFieldCount -- confirm {} records cannot occur.
			if stats.MinFieldCount == 0 || cnt < stats.MinFieldCount {
				stats.MinFieldCount = cnt
			}
			if cnt > stats.MaxFieldCount {
				stats.MaxFieldCount = cnt
			}
			keys := make([]string, 0, cnt)
			for k := range obj {
				keys = append(keys, k)
			}
			sortStrings(keys)
			sig := strings.Join(keys, "\x1f") // case-sensitive
			if refSig == "" {
				refSig = sig
				refCount = cnt
			} else {
				if sig != refSig {
					stats.AllKeysSame = false
				}
				if cnt != refCount {
					stats.AllFieldCountsSame = false
				}
			}
			// collect limited signature stats (at most 10 distinct schemas;
			// already-known signatures keep counting past the cap)
			if _, ok := stats.SigCounts[sig]; ok || len(stats.SigCounts) < 10 {
				stats.SigCounts[sig]++
				if _, ok := stats.SigExampleKeys[sig]; !ok {
					stats.SigExampleKeys[sig] = keys
				}
			} else {
				stats.SigOverflow = true
			}
		}
		// If no objects parsed, keep flags meaningful
		if stats.JSONObjects == 0 {
			stats.AllKeysSame = false
			stats.AllFieldCountsSame = false
		} else if stats.MinFieldCount != stats.MaxFieldCount {
			stats.AllFieldCountsSame = false
		}
	}
	return stats, nil
}
// encodeKeyBytes превращает строку ключа в 32 байта (SHA256) для AES-256
// encodeKeyBytes derives a 32-byte AES-256 key from an arbitrary key
// string by hashing it with SHA-256.
func encodeKeyBytes(keyStr string) []byte {
	digest := sha256.Sum256([]byte(keyStr))
	key := make([]byte, len(digest))
	copy(key, digest[:])
	return key
}
// aesGCMDecrypt — расшифровка AES-256-GCM (nonce||ciphertext)
// aesGCMDecrypt decrypts an AES-GCM payload laid out as nonce||ciphertext.
// The key must have a valid AES length (16, 24 or 32 bytes); decryption
// failure (including authentication failure) is returned as an error.
func aesGCMDecrypt(ciphertext, key []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return nil, err
	}
	ns := gcm.NonceSize()
	if len(ciphertext) < ns {
		return nil, fmt.Errorf("ciphertext too short")
	}
	// Leading ns bytes are the nonce; the remainder is sealed data + tag.
	return gcm.Open(nil, ciphertext[:ns], ciphertext[ns:], nil)
}
// fillDecryptedJSONStats заполняет JSON-статистику.
// Если keyStr == "", предполагается, что строки содержат обычный JSON.
// Если keyStr != "", строки считаются base64(AES-GCM(JSON)).
// fillDecryptedJSONStats fills the JSON field statistics of stats.
// If keyStr == "", the lines are assumed to contain plain JSON.
// If keyStr != "", the lines are treated as base64(AES-GCM(JSON)).
// Per-line decode/decrypt/parse failures are counted in JSONParseErrors
// rather than aborting; only a file read error is returned.
func fillDecryptedJSONStats(path, keyStr string, stats *fileStats) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	lines := strings.Split(strings.TrimSuffix(string(data), "\n"), "\n")
	// Reset any JSON statistics left over from a previous pass.
	stats.JSONObjects = 0
	stats.JSONParseErrors = 0
	stats.MinFieldCount = 0
	stats.MaxFieldCount = 0
	stats.AllKeysSame = true
	stats.AllFieldCountsSame = true
	stats.SigOverflow = false
	stats.SigCounts = make(map[string]int)
	stats.SigExampleKeys = make(map[string][]string)
	var key []byte
	if keyStr != "" {
		key = encodeKeyBytes(keyStr)
	}
	var refSig string // key-set signature of the first parsed object
	var refCount int  // field count of the first parsed object
	for _, l := range lines {
		if l == "" {
			continue
		}
		// Strip the right-padding spaces of the fixed record layout.
		raw := strings.TrimRight(l, " ")
		if raw == "" {
			continue
		}
		var payload []byte
		if key != nil {
			// Encrypted mode: base64-decode, then AES-GCM decrypt.
			enc, err := base64.StdEncoding.DecodeString(strings.TrimSpace(raw))
			if err != nil {
				stats.JSONParseErrors++
				continue
			}
			plain, err := aesGCMDecrypt(enc, key)
			if err != nil {
				stats.JSONParseErrors++
				continue
			}
			payload = plain
		} else {
			payload = []byte(strings.TrimSpace(raw))
		}
		core := strings.TrimSpace(string(payload))
		if core == "" {
			continue
		}
		// Every record is expected to be a single JSON object.
		if !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
			stats.JSONParseErrors++
			continue
		}
		var obj map[string]any
		if err := json.Unmarshal([]byte(core), &obj); err != nil {
			stats.JSONParseErrors++
			continue
		}
		stats.JSONObjects++
		cnt := len(obj)
		// NOTE(review): 0 doubles as "unset", so an empty object {} cannot
		// lower MinFieldCount -- confirm {} records cannot occur.
		if stats.MinFieldCount == 0 || cnt < stats.MinFieldCount {
			stats.MinFieldCount = cnt
		}
		if cnt > stats.MaxFieldCount {
			stats.MaxFieldCount = cnt
		}
		keys := make([]string, 0, cnt)
		for k := range obj {
			keys = append(keys, k)
		}
		sortStrings(keys)
		// Signature is the sorted key list joined with a non-printable
		// separator; comparison is case-sensitive.
		sig := strings.Join(keys, "\x1f")
		if refSig == "" {
			refSig = sig
			refCount = cnt
		} else {
			if sig != refSig {
				stats.AllKeysSame = false
			}
			if cnt != refCount {
				stats.AllFieldCountsSame = false
			}
		}
		// Track at most 10 distinct schemas; known ones keep counting.
		if _, ok := stats.SigCounts[sig]; ok || len(stats.SigCounts) < 10 {
			stats.SigCounts[sig]++
			if _, ok := stats.SigExampleKeys[sig]; !ok {
				stats.SigExampleKeys[sig] = keys
			}
		} else {
			stats.SigOverflow = true
		}
	}
	// Keep the equality flags meaningful when nothing parsed.
	if stats.JSONObjects == 0 {
		stats.AllKeysSame = false
		stats.AllFieldCountsSame = false
	} else if stats.MinFieldCount != stats.MaxFieldCount {
		stats.AllFieldCountsSame = false
	}
	return nil
}
// printStats writes a human-readable, colorized report of s to stdout:
// line counts and lengths first, then — depending on whether the file
// looks encoded — the JSON object/field/schema statistics.
//
// The JSON reporting was previously duplicated verbatim in both the
// encoded and plain-JSON branches; it is factored into printJSONStats.
func printStats(s *fileStats) {
	fmt.Printf("\n%sStats for%s %s%s%s\n", colorBold, colorReset, colorCyan, s.Path, colorReset)
	fmt.Printf(" Total lines: %d\n", s.LineCount)
	fmt.Printf(" Non-empty lines: %d\n", s.NonEmptyCount)
	fmt.Printf(" Min line length: %d\n", s.MinLen)
	fmt.Printf(" Max line length: %d\n", s.MaxLen)
	fmt.Printf(" Max data length: %d (without trailing spaces)\n", s.MaxDataLen)
	fmt.Printf(" Unique lengths: ")
	first := true
	for l, c := range s.Lengths {
		if !first {
			fmt.Print(", ")
		}
		first = false
		fmt.Printf("%d(%d)", l, c)
	}
	fmt.Println()
	if s.ProbablyBase64 {
		fmt.Printf(" Encoded: %slikely encoded%s (no { } around data)\n", colorYellow, colorReset)
		// Field statistics exist only if a decrypt/parse pass was run.
		if s.JSONObjects > 0 || s.JSONParseErrors > 0 {
			fmt.Printf(" Decoded JSON: %s%s%s\n", colorCyan, "analysis", colorReset)
			printJSONStats(s)
		}
	} else {
		fmt.Printf(" Encoded: %slooks like plain JSON%s\n", colorGreen, colorReset)
		printJSONStats(s)
	}
}

// printJSONStats prints the JSON object/field/schema statistics shared by
// both presentation branches of printStats.
func printJSONStats(s *fileStats) {
	fmt.Printf(" JSON objects: %d\n", s.JSONObjects)
	if s.JSONParseErrors > 0 {
		fmt.Printf(" JSON errors: %s%d%s\n", colorYellow, s.JSONParseErrors, colorReset)
	} else {
		fmt.Printf(" JSON errors: %s0%s\n", colorGreen, colorReset)
	}
	if s.JSONObjects == 0 {
		return
	}
	fmt.Printf(" Fields per rec: min=%d max=%d\n", s.MinFieldCount, s.MaxFieldCount)
	if s.AllFieldCountsSame {
		fmt.Printf(" Field counts: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field counts: %sNOT equal%s\n", colorRed, colorReset)
	}
	if s.AllKeysSame {
		fmt.Printf(" Field names: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field names: %sNOT equal%s\n", colorRed, colorReset)
	}
	if len(s.SigCounts) == 0 {
		return
	}
	schemaCount := len(s.SigCounts)
	extra := ""
	if s.SigOverflow {
		extra = " (showing first 10)"
	}
	color := colorGreen
	if schemaCount > 1 {
		color = colorYellow
	}
	fmt.Printf(" Schemas: %s%d%s%s\n", color, schemaCount, colorReset, extra)
	// Show an example reference schema plus the first differing one.
	type kv struct {
		sig   string
		count int
	}
	var list []kv
	for sig, c := range s.SigCounts {
		list = append(list, kv{sig: sig, count: c})
	}
	if len(list) == 0 {
		return
	}
	// No alphabetical sort: the first map-iteration entry is the reference.
	ref := list[0]
	refKeys := s.SigExampleKeys[ref.sig]
	fmt.Printf(" Ref schema keys (%s%d%s recs): %v\n", colorCyan, ref.count, colorReset, refKeys)
	for _, kv := range list[1:] {
		if kv.sig != ref.sig {
			diffKeys := s.SigExampleKeys[kv.sig]
			fmt.Printf(" Diff schema keys (%s%d%s recs): %v\n", colorYellow, kv.count, colorReset, diffKeys)
			break
		}
	}
}
// sortStrings sorts a in ascending lexicographic order, in place.
//
// The previous hand-rolled insertion sort was O(n^2) and its "avoid extra
// deps" rationale does not apply: package sort is part of the standard
// library, so this adds no external dependency.
func sortStrings(a []string) {
	sort.Strings(a)
}
// reallocFile rewrites path so that every remaining record is right-padded
// with spaces to exactly newAlloc-1 characters plus a trailing newline.
// Blank and whitespace-only lines are treated as deleted records and
// dropped. The original content is saved to <path>.bak before the rewrite.
func reallocFile(path string, stats *fileStats, newAlloc int) error {
	if stats.NonEmptyCount == 0 {
		fmt.Printf("%sFile is empty, nothing to realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	// One slot of the allocation is reserved for the newline.
	targetLen := newAlloc - 1
	if targetLen < 1 {
		return fmt.Errorf("target allocSize too small")
	}
	if stats.MaxDataLen > targetLen {
		return fmt.Errorf("cannot set allocSize=%d: max data length %d would be truncated", newAlloc, stats.MaxDataLen)
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	var out bytes.Buffer
	for _, line := range strings.Split(strings.TrimSuffix(string(data), "\n"), "\n") {
		record := strings.TrimRight(line, " ")
		if record == "" {
			// Empty or space-only line: a deleted record, skip it.
			continue
		}
		// Defensive re-check; stats.MaxDataLen should already guarantee this.
		if len(record) > targetLen {
			return fmt.Errorf("line data length %d exceeds targetLen %d", len(record), targetLen)
		}
		out.WriteString(record)
		out.WriteString(strings.Repeat(" ", targetLen-len(record)))
		out.WriteByte('\n')
	}
	// Keep a backup of the original bytes before overwriting in place.
	backup := path + ".bak"
	if err := os.WriteFile(backup, data, 0644); err != nil {
		return fmt.Errorf("write backup: %w", err)
	}
	if err := os.WriteFile(path, out.Bytes(), 0644); err != nil {
		return fmt.Errorf("write new file: %w", err)
	}
	fmt.Printf("%sBackup saved%s to %s\n", colorGreen, colorReset, backup)
	return nil
}