// Web-viewer metadata captured together with this source (kept as a
// comment so the file remains valid Go):
//   File: elowdb-go/tools/realloc/main.go
//   Snapshot: 2026-03-04 16:29:24 +06:00 — 640 lines, 17 KiB, Go
//   The viewer flagged "ambiguous Unicode characters" (characters that can
//   be confused with others). If intentional, the warning can be ignored.
package main
import (
	"bufio"
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"crypto/sha256"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
)
// fileStats aggregates everything the tool learns about one JSONL file:
// raw line-length statistics plus, when the records are parseable JSON
// objects, per-record field and schema statistics.
type fileStats struct {
	Path           string      // path of the analyzed file
	LineCount      int         // total number of lines, including empty ones
	NonEmptyCount  int         // lines with at least one character
	Lengths        map[int]int // line length -> number of lines with that length
	MinLen         int         // shortest non-empty line length (0 when file has none)
	MaxLen         int         // longest line length (includes trailing space padding)
	MaxDataLen     int         // longest line length after trailing spaces are stripped
	ProbablyBase64 bool        // heuristic: most lines are not wrapped in { ... }
	// JSON-object stats (filled for plain JSON, or after the user supplies
	// a decryption key for an encoded file; see fillDecryptedJSONStats)
	JSONObjects        int  // lines that parsed as JSON objects
	JSONParseErrors    int  // lines that failed base64/decrypt/JSON parsing
	MinFieldCount      int  // fewest fields seen in a single object
	MaxFieldCount      int  // most fields seen in a single object
	AllKeysSame        bool // every object had the identical key set
	AllFieldCountsSame bool // every object had the identical field count
	SigOverflow        bool // more than 10 distinct schemas were observed
	SigCounts          map[string]int      // schema signature -> record count (capped at 10 schemas)
	SigExampleKeys     map[string][]string // schema signature -> example sorted key list
}
// ANSI escape sequences used to colorize terminal output.
const (
	colorReset  = "\x1b[0m"
	colorRed    = "\x1b[31m"
	colorGreen  = "\x1b[32m"
	colorYellow = "\x1b[33m"
	colorCyan   = "\x1b[36m"
	colorBold   = "\x1b[1m"
)
// main validates the command line and delegates all work to run,
// exiting with status 1 on a usage error or any failure run reports.
func main() {
	if len(os.Args) < 2 {
		fmt.Printf("%sUsage:%s realloc <directory-with-jsonl-files>\n", colorCyan, colorReset)
		os.Exit(1)
	}
	if err := run(os.Args[1]); err != nil {
		fmt.Fprintf(os.Stderr, "%sError:%s %v\n", colorRed, colorReset, err)
		os.Exit(1)
	}
}
// run drives the interactive workflow: list the *.jsonl files in dir, let
// the user pick one, print its statistics, and optionally rewrite it with
// a new fixed per-record allocation size.
func run(dir string) error {
	info, err := os.Stat(dir)
	if err != nil {
		return fmt.Errorf("stat dir: %w", err)
	}
	if !info.IsDir() {
		return fmt.Errorf("%s is not a directory", dir)
	}
	files, err := filepath.Glob(filepath.Join(dir, "*.jsonl"))
	if err != nil {
		return fmt.Errorf("glob: %w", err)
	}
	if len(files) == 0 {
		fmt.Printf("%sNo *.jsonl files found in%s %s\n", colorYellow, colorReset, dir)
		return nil
	}
	fmt.Printf("%sFound JSONL files:%s\n", colorCyan, colorReset)
	for i, f := range files {
		fmt.Printf(" [%d] %s\n", i+1, filepath.Base(f))
	}
	reader := bufio.NewReader(os.Stdin)
	fmt.Printf("\n%sEnter file index or name (empty to cancel): %s", colorCyan, colorReset)
	input, _ := reader.ReadString('\n')
	input = strings.TrimSpace(input)
	if input == "" {
		fmt.Printf("%sCancelled.%s\n", colorYellow, colorReset)
		return nil
	}
	path, ok := pickFile(files, input)
	if !ok {
		return fmt.Errorf("file %q not found in %s", input, dir)
	}
	stats, err := analyzeFile(path)
	if err != nil {
		return err
	}
	if stats.ProbablyBase64 {
		// Looks encoded: optionally ask for a key so field statistics can
		// still be computed from the decrypted JSON.
		fmt.Printf("\n%sFile looks encoded. If it was encrypted with Encode/EncodeKey, you can enter the key to analyze JSON fields.%s\n", colorYellow, colorReset)
		fmt.Printf("%sEncode key (press Enter to skip): %s", colorCyan, colorReset)
		keyLine, _ := reader.ReadString('\n')
		keyLine = strings.TrimSpace(keyLine)
		if keyLine != "" {
			if err := fillDecryptedJSONStats(path, keyLine, stats); err != nil {
				fmt.Fprintf(os.Stderr, "%sDecrypt/JSON analyze error:%s %v\n", colorRed, colorReset, err)
			}
		}
	} else {
		// Plain JSON: compute field statistics directly.
		if err := fillDecryptedJSONStats(path, "", stats); err != nil {
			fmt.Fprintf(os.Stderr, "%sJSON analyze error:%s %v\n", colorRed, colorReset, err)
		}
	}
	printStats(stats)
	// Suggest the smallest allocSize that still fits every record's data.
	// BUGFIX: base the suggestion on MaxDataLen (trailing padding stripped)
	// rather than MaxLen, which includes existing padding — with MaxLen the
	// suggestion always equaled the current allocSize on an already-padded
	// file, so it could never shrink one.
	recommendedAlloc := stats.MaxDataLen + 1
	fmt.Printf("\n%sSuggested allocSize%s (from max data length %d): %s%d%s\n",
		colorCyan, colorReset, stats.MaxDataLen, colorGreen, recommendedAlloc, colorReset)
	fmt.Printf("%sEnter new allocSize%s (empty to skip, 0 to use suggested): ", colorCyan, colorReset)
	line, _ := reader.ReadString('\n')
	line = strings.TrimSpace(line)
	if line == "" {
		fmt.Printf("%sSkipped realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	targetAlloc := recommendedAlloc
	if line != "0" {
		val, err := strconv.Atoi(line)
		if err != nil || val < 2 {
			return fmt.Errorf("invalid allocSize: %q", line)
		}
		targetAlloc = val
	}
	if err := reallocFile(path, stats, targetAlloc); err != nil {
		return err
	}
	fmt.Printf("%sRealloc completed.%s File updated: %s\n", colorGreen, colorReset, path)
	return nil
}

// pickFile resolves the user's selection, which may be either a 1-based
// index into files or an exact base-name match. ok is false when nothing
// matches.
func pickFile(files []string, input string) (path string, ok bool) {
	if idx, err := strconv.Atoi(input); err == nil && idx >= 1 && idx <= len(files) {
		return files[idx-1], true
	}
	for _, f := range files {
		if filepath.Base(f) == input {
			return f, true
		}
	}
	return "", false
}
// analyzeFile reads path and computes line-length statistics. When the
// lines look like plain JSON objects (wrapped in { ... }), it also fills
// the per-record field and schema statistics.
func analyzeFile(path string) (*fileStats, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read file: %w", err)
	}
	lines := strings.Split(string(data), "\n")
	stats := &fileStats{
		Path:           path,
		LineCount:      len(lines),
		Lengths:        make(map[int]int),
		MinLen:         0,
		MaxLen:         0,
		MaxDataLen:     0,
		SigCounts:      make(map[string]int),
		SigExampleKeys: make(map[string][]string),
	}
	var nonEmpty []string
	for _, l := range lines {
		if l == "" {
			continue
		}
		nonEmpty = append(nonEmpty, l)
	}
	stats.NonEmptyCount = len(nonEmpty)
	if len(nonEmpty) == 0 {
		return stats, nil
	}
	// Heuristic: lines whose significant content is not wrapped in { ... }
	// are counted as "encoded" (e.g. base64 of encrypted JSON).
	encodedCandidates := 0
	for _, l := range nonEmpty {
		n := len(l)
		stats.Lengths[n]++
		if stats.MinLen == 0 || n < stats.MinLen {
			stats.MinLen = n
		}
		if n > stats.MaxLen {
			stats.MaxLen = n
		}
		trimmed := strings.TrimRight(l, " ")
		if len(trimmed) > stats.MaxDataLen {
			stats.MaxDataLen = len(trimmed)
		}
		core := strings.TrimSpace(l)
		if core != "" && !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
			encodedCandidates++
		}
	}
	if stats.NonEmptyCount > 0 && encodedCandidates >= stats.NonEmptyCount/2 {
		stats.ProbablyBase64 = true
	}
	// Field-level JSON statistics — only when the file does not look encoded.
	if !stats.ProbablyBase64 {
		stats.AllKeysSame = true
		stats.AllFieldCountsSame = true
		var refSig string
		var refCount int
		// BUGFIX: track the first parsed object explicitly instead of using
		// `MinFieldCount == 0` / `refSig == ""` as sentinels. The sentinels
		// broke on empty objects {}: a later object would reset the minimum,
		// and the empty key signature was never accepted as the reference.
		seenObject := false
		for _, l := range nonEmpty {
			core := strings.TrimSpace(strings.TrimRight(l, " "))
			if core == "" {
				continue
			}
			// Each record is expected to be a single JSON object.
			if !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
				stats.JSONParseErrors++
				continue
			}
			var obj map[string]any
			if err := json.Unmarshal([]byte(core), &obj); err != nil {
				stats.JSONParseErrors++
				continue
			}
			stats.JSONObjects++
			cnt := len(obj)
			keys := make([]string, 0, cnt)
			for k := range obj {
				keys = append(keys, k)
			}
			sortStrings(keys)
			sig := strings.Join(keys, "\x1f") // case-sensitive key signature
			if !seenObject {
				stats.MinFieldCount = cnt
				stats.MaxFieldCount = cnt
				refSig = sig
				refCount = cnt
				seenObject = true
			} else {
				if cnt < stats.MinFieldCount {
					stats.MinFieldCount = cnt
				}
				if cnt > stats.MaxFieldCount {
					stats.MaxFieldCount = cnt
				}
				if sig != refSig {
					stats.AllKeysSame = false
				}
				if cnt != refCount {
					stats.AllFieldCountsSame = false
				}
			}
			// Collect at most 10 distinct schema signatures as examples.
			if _, ok := stats.SigCounts[sig]; ok || len(stats.SigCounts) < 10 {
				stats.SigCounts[sig]++
				if _, ok := stats.SigExampleKeys[sig]; !ok {
					stats.SigExampleKeys[sig] = keys
				}
			} else {
				stats.SigOverflow = true
			}
		}
		// Keep the flags meaningful when no objects parsed at all.
		if stats.JSONObjects == 0 {
			stats.AllKeysSame = false
			stats.AllFieldCountsSame = false
		} else if stats.MinFieldCount != stats.MaxFieldCount {
			stats.AllFieldCountsSame = false
		}
	}
	return stats, nil
}
// encodeKeyBytes derives a fixed 32-byte key for AES-256 from an arbitrary
// passphrase by hashing it with SHA-256.
func encodeKeyBytes(keyStr string) []byte {
	digest := sha256.Sum256([]byte(keyStr))
	key := make([]byte, len(digest))
	copy(key, digest[:])
	return key
}
// aesGCMDecrypt decrypts an AES-256-GCM payload laid out as nonce||ciphertext
// and returns the plaintext, or an error when the key, the payload size, or
// the authentication tag is invalid.
func aesGCMDecrypt(ciphertext, key []byte) ([]byte, error) {
	blk, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}
	gcm, err := cipher.NewGCM(blk)
	if err != nil {
		return nil, err
	}
	ns := gcm.NonceSize()
	if len(ciphertext) < ns {
		return nil, fmt.Errorf("ciphertext too short")
	}
	// The leading ns bytes are the nonce; the remainder is sealed data.
	return gcm.Open(nil, ciphertext[:ns], ciphertext[ns:], nil)
}
// fillDecryptedJSONStats recomputes the JSON field statistics on stats.
// When keyStr == "" every line is assumed to contain plain JSON; otherwise
// each line is treated as base64(AES-256-GCM(JSON)) and decrypted with a
// key derived from keyStr. Lines that fail decoding, decryption, or JSON
// parsing are counted in JSONParseErrors.
func fillDecryptedJSONStats(path, keyStr string, stats *fileStats) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	lines := strings.Split(strings.TrimSuffix(string(data), "\n"), "\n")
	// Reset any previously collected JSON statistics.
	stats.JSONObjects = 0
	stats.JSONParseErrors = 0
	stats.MinFieldCount = 0
	stats.MaxFieldCount = 0
	stats.AllKeysSame = true
	stats.AllFieldCountsSame = true
	stats.SigOverflow = false
	stats.SigCounts = make(map[string]int)
	stats.SigExampleKeys = make(map[string][]string)
	var key []byte
	if keyStr != "" {
		key = encodeKeyBytes(keyStr)
	}
	var refSig string
	var refCount int
	// BUGFIX: track the first parsed object explicitly instead of using
	// `MinFieldCount == 0` / `refSig == ""` as sentinels, which broke on
	// empty objects {} (a later object reset the minimum, and the empty
	// key signature was never accepted as the reference).
	seenObject := false
	for _, l := range lines {
		if l == "" {
			continue
		}
		raw := strings.TrimRight(l, " ")
		if raw == "" {
			continue
		}
		var payload []byte
		if key != nil {
			// base64 -> AES-GCM decrypt; any failure counts as a parse error.
			enc, err := base64.StdEncoding.DecodeString(strings.TrimSpace(raw))
			if err != nil {
				stats.JSONParseErrors++
				continue
			}
			plain, err := aesGCMDecrypt(enc, key)
			if err != nil {
				stats.JSONParseErrors++
				continue
			}
			payload = plain
		} else {
			payload = []byte(strings.TrimSpace(raw))
		}
		core := strings.TrimSpace(string(payload))
		if core == "" {
			continue
		}
		if !(strings.HasPrefix(core, "{") && strings.HasSuffix(core, "}")) {
			stats.JSONParseErrors++
			continue
		}
		var obj map[string]any
		if err := json.Unmarshal([]byte(core), &obj); err != nil {
			stats.JSONParseErrors++
			continue
		}
		stats.JSONObjects++
		cnt := len(obj)
		keys := make([]string, 0, cnt)
		for k := range obj {
			keys = append(keys, k)
		}
		sortStrings(keys)
		sig := strings.Join(keys, "\x1f") // case-sensitive key signature
		if !seenObject {
			stats.MinFieldCount = cnt
			stats.MaxFieldCount = cnt
			refSig = sig
			refCount = cnt
			seenObject = true
		} else {
			if cnt < stats.MinFieldCount {
				stats.MinFieldCount = cnt
			}
			if cnt > stats.MaxFieldCount {
				stats.MaxFieldCount = cnt
			}
			if sig != refSig {
				stats.AllKeysSame = false
			}
			if cnt != refCount {
				stats.AllFieldCountsSame = false
			}
		}
		// Collect at most 10 distinct schema signatures as examples.
		if _, ok := stats.SigCounts[sig]; ok || len(stats.SigCounts) < 10 {
			stats.SigCounts[sig]++
			if _, ok := stats.SigExampleKeys[sig]; !ok {
				stats.SigExampleKeys[sig] = keys
			}
		} else {
			stats.SigOverflow = true
		}
	}
	// Keep the flags meaningful when no objects parsed at all.
	if stats.JSONObjects == 0 {
		stats.AllKeysSame = false
		stats.AllFieldCountsSame = false
	} else if stats.MinFieldCount != stats.MaxFieldCount {
		stats.AllFieldCountsSame = false
	}
	return nil
}
// printStats pretty-prints the collected statistics for one file.
func printStats(s *fileStats) {
	fmt.Printf("\n%sStats for%s %s%s%s\n", colorBold, colorReset, colorCyan, s.Path, colorReset)
	fmt.Printf(" Total lines: %d\n", s.LineCount)
	fmt.Printf(" Non-empty lines: %d\n", s.NonEmptyCount)
	fmt.Printf(" Min line length: %d\n", s.MinLen)
	fmt.Printf(" Max line length: %d\n", s.MaxLen)
	fmt.Printf(" Max data length: %d (without trailing spaces)\n", s.MaxDataLen)
	fmt.Printf(" Unique lengths: ")
	first := true
	for l, c := range s.Lengths {
		if !first {
			fmt.Print(", ")
		}
		first = false
		fmt.Printf("%d(%d)", l, c)
	}
	fmt.Println()
	if s.ProbablyBase64 {
		fmt.Printf(" Encoded: %slikely encoded%s (no { } around data)\n", colorYellow, colorReset)
		// JSON field statistics only exist here when the user supplied a
		// decryption key and fillDecryptedJSONStats ran.
		if s.JSONObjects > 0 || s.JSONParseErrors > 0 {
			fmt.Printf(" Decoded JSON: %s%s%s\n", colorCyan, "analysis", colorReset)
			printJSONStats(s)
		}
	} else {
		fmt.Printf(" Encoded: %slooks like plain JSON%s\n", colorGreen, colorReset)
		printJSONStats(s)
	}
}

// printJSONStats prints the per-record JSON statistics: object and error
// counts, the field-count range, key-set uniformity, and example schemas.
// Factored out of printStats, which previously duplicated this entire
// section verbatim in its encoded and plain-JSON branches.
func printJSONStats(s *fileStats) {
	fmt.Printf(" JSON objects: %d\n", s.JSONObjects)
	if s.JSONParseErrors > 0 {
		fmt.Printf(" JSON errors: %s%d%s\n", colorYellow, s.JSONParseErrors, colorReset)
	} else {
		fmt.Printf(" JSON errors: %s0%s\n", colorGreen, colorReset)
	}
	if s.JSONObjects == 0 {
		return
	}
	fmt.Printf(" Fields per rec: min=%d max=%d\n", s.MinFieldCount, s.MaxFieldCount)
	if s.AllFieldCountsSame {
		fmt.Printf(" Field counts: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field counts: %sNOT equal%s\n", colorRed, colorReset)
	}
	if s.AllKeysSame {
		fmt.Printf(" Field names: %sall equal%s\n", colorGreen, colorReset)
	} else {
		fmt.Printf(" Field names: %sNOT equal%s\n", colorRed, colorReset)
	}
	if len(s.SigCounts) == 0 {
		return
	}
	schemaCount := len(s.SigCounts)
	extra := ""
	if s.SigOverflow {
		extra = " (showing first 10)"
	}
	color := colorGreen
	if schemaCount > 1 {
		color = colorYellow
	}
	fmt.Printf(" Schemas: %s%d%s%s\n", color, schemaCount, colorReset, extra)
	// Show one reference schema plus the first differing schema (if any).
	// NOTE: map iteration order is random, so which schema is "reference"
	// varies between runs — same behavior as the original code.
	type kv struct {
		sig   string
		count int
	}
	var list []kv
	for sig, c := range s.SigCounts {
		list = append(list, kv{sig: sig, count: c})
	}
	if len(list) == 0 {
		return
	}
	ref := list[0]
	fmt.Printf(" Ref schema keys (%s%d%s recs): %v\n", colorCyan, ref.count, colorReset, s.SigExampleKeys[ref.sig])
	for _, e := range list[1:] {
		if e.sig != ref.sig {
			fmt.Printf(" Diff schema keys (%s%d%s recs): %v\n", colorYellow, e.count, colorReset, s.SigExampleKeys[e.sig])
			break
		}
	}
}
// sortStrings sorts a in ascending lexicographic (byte-wise) order, in
// place. Uses the standard library's sort.Strings instead of the previous
// hand-rolled insertion sort — sort is stdlib, so it adds no module
// dependency, and it is O(n log n) rather than O(n^2).
func sortStrings(a []string) {
	sort.Strings(a)
}
// reallocFile rewrites path so every record is right-padded with spaces to
// newAlloc-1 characters (the trailing newline is excluded from the count).
// A .bak copy of the original bytes is written before the file itself is
// overwritten; any line whose trimmed content exceeds the target length
// aborts the rewrite before anything on disk is touched.
func reallocFile(path string, stats *fileStats, newAlloc int) error {
	if stats.NonEmptyCount == 0 {
		fmt.Printf("%sFile is empty, nothing to realloc.%s\n", colorYellow, colorReset)
		return nil
	}
	targetLen := newAlloc - 1
	if targetLen < 1 {
		return fmt.Errorf("target allocSize too small")
	}
	if stats.MaxDataLen > targetLen {
		return fmt.Errorf("cannot set allocSize=%d: max data length %d would be truncated", newAlloc, stats.MaxDataLen)
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}
	var out bytes.Buffer
	for _, line := range strings.Split(strings.TrimSuffix(string(data), "\n"), "\n") {
		record := strings.TrimRight(line, " ")
		if record == "" {
			// Empty or space-only lines are deleted records — drop them.
			continue
		}
		if len(record) > targetLen {
			return fmt.Errorf("line data length %d exceeds targetLen %d", len(record), targetLen)
		}
		out.WriteString(record)
		out.WriteString(strings.Repeat(" ", targetLen-len(record)))
		out.WriteByte('\n')
	}
	// Persist a backup of the original content before overwriting in place.
	backup := path + ".bak"
	if err := os.WriteFile(backup, data, 0644); err != nil {
		return fmt.Errorf("write backup: %w", err)
	}
	if err := os.WriteFile(path, out.Bytes(), 0644); err != nil {
		return fmt.Errorf("write new file: %w", err)
	}
	fmt.Printf("%sBackup saved%s to %s\n", colorGreen, colorReset, backup)
	return nil
}