#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
nts_stability_s1s5.py - S₁–S₅ stability over windows (extended)
----------------------------------------------------------------
- Computes H, MI, P_s and STS over K windows of W tokens (with a configurable shift).
- Normalises each series to 0–1, takes its standard deviation and derives the indices S1..S5.
- Exports a per-window CSV and a JSON summary.

Example:
python nts_stability_s1s5.py \
  --eva "/mnt/data/voinich EVA.txt" \
  --window 50 --windows 8 \
  --out_csv "/mnt/data/nts_outputs/stability_windows.csv" \
  --out_json "/mnt/data/nts_outputs/stability_summary.json"
"""
import argparse, re, math, json, statistics
from collections import Counter
from pathlib import Path

import pandas as pd


def tokenize_path(path):
    """Read a text file and return its lowercase alphabetic tokens."""
    txt = Path(path).read_text(encoding="utf-8", errors="ignore")
    return re.findall(r"[a-z]+", txt.lower())


def compute_metrics(words):
    """Return unigram entropy (H), adjacent-word mutual information (MI),
    type/token ratio (P_s) and bigram type/token ratio (STS) for one window."""
    total = len(words)
    freq = Counter(words)
    pairs = list(zip(words, words[1:]))
    pair_counts = Counter(pairs)
    pair_total = max(1, len(pairs))

    # Unigram Shannon entropy in bits.
    probs = [c / total for c in freq.values()] if total else []
    H = -sum(p * math.log2(p) for p in probs) if probs else 0.0

    # Mutual information between adjacent words (p_xy-weighted PMI over bigrams).
    MI = 0.0
    for (w1, w2), c in pair_counts.items():
        p_xy = c / pair_total
        p_x = freq[w1] / total
        p_y = freq[w2] / total
        if p_xy > 0 and p_x > 0 and p_y > 0:
            MI += p_xy * math.log2(p_xy / (p_x * p_y))

    P_s = len(freq) / total if total else 0.0                     # distinct words / tokens
    STS = len(pair_counts) / pair_total if pair_total else 0.0    # distinct bigrams / bigrams
    return {"H": H, "MI": MI, "P_s": P_s, "STS": STS, "N": total, "V": len(freq)}


def normalize(series):
    """Min-max normalise a series to [0, 1]; a constant series maps to all zeros."""
    lo, hi = min(series), max(series)
    if hi - lo <= 1e-12:
        return [0.0 for _ in series]
    return [(x - lo) / (hi - lo) for x in series]


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--eva", required=True, help="EVA corpus (txt)")
    ap.add_argument("--window", type=int, default=50, help="Window size (tokens)")
    ap.add_argument("--windows", type=int, default=8, help="Number of windows")
    ap.add_argument("--shift", type=int, default=0, help="Shift between windows (0 = non-overlapping)")
    ap.add_argument("--out_csv", default="", help="Per-window CSV")
    ap.add_argument("--out_json", default="", help="JSON summary")
    args = ap.parse_args()

    tokens = tokenize_path(args.eva)
    W, K = args.window, args.windows
    S = args.shift if args.shift > 0 else W  # default shift = window size (non-overlapping)

    # Slice the token stream into K windows and compute the metrics per window.
    rows = []
    start = 0
    for i in range(K):
        end = start + W
        words = tokens[start:end]
        if not words:
            break
        m = compute_metrics(words)
        rows.append({"window": i + 1, **m})
        start += S

    if not rows:
        raise SystemExit("No windows analysed.")

    df = pd.DataFrame(rows)

    # Normalisation: put every series on a 0-1 scale before comparing spreads.
    Hn = normalize(df["H"].tolist())
    MIn = normalize(df["MI"].tolist())
    Psn = normalize(df["P_s"].tolist())
    STSn = normalize(df["STS"].tolist())

    # Stability indices: 1 minus the population standard deviation of each
    # normalised series; S5 is their plain average.
    S1 = 1 - statistics.pstdev(Psn)
    S2 = 1 - statistics.pstdev(STSn)
    S3 = 1 - statistics.pstdev(Hn)
    S4 = 1 - statistics.pstdev(MIn)
    S5 = (S1 + S2 + S3 + S4) / 4

    if args.out_csv:
        Path(args.out_csv).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(args.out_csv, index=False, encoding="utf-8")

    summary = {
        "params": {"window": W, "windows": len(rows), "shift": S, "eva_path": args.eva},
        "S": {"S1": S1, "S2": S2, "S3": S3, "S4": S4, "S5": S5},
        "means": {
            "H": float(df["H"].mean()),
            "MI": float(df["MI"].mean()),
            "P_s": float(df["P_s"].mean()),
            "STS": float(df["STS"].mean()),
        },
    }
    if args.out_json:
        Path(args.out_json).parent.mkdir(parents=True, exist_ok=True)
        Path(args.out_json).write_text(json.dumps(summary, indent=2), encoding="utf-8")

    # Log
    print("=== Stability S1..S5 (extended) ===")
    print(f"Windows analysed: {len(rows)} | W={W} | Shift={S}")
    for k, v in summary["S"].items():
        print(f"{k}: {v:.3f}")


if __name__ == "__main__":
    main()
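
# --- Illustrative sketch (not part of the CLI flow above; the toy token list is
# invented purely for demonstration). The helpers can also be used directly from
# a REPL or notebook, e.g.:
#
#     from nts_stability_s1s5 import compute_metrics
#     toy = ["daiin", "chedy", "daiin", "qokeedy", "chedy", "daiin"]
#     m = compute_metrics(toy)
#     m["N"], m["V"]   # (6, 3): six tokens, three distinct words
#     m["P_s"]         # 3 / 6 = 0.5 (type/token ratio of the window)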