#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
nts_stability_s1s5.py - S₁–S₅ stability over windows (extended)
----------------------------------------------------------------
- Computes H, MI, P_s and STS over K windows of W tokens (with a configurable shift).
- Normalises each series to 0–1, takes its standard deviation and derives the indices S1..S5.
- Exports a per-window CSV and a JSON summary.

Example:
python nts_stability_s1s5.py \
  --eva "/mnt/data/voinich EVA.txt" \
  --window 50 --windows 8 \
  --out_csv "/mnt/data/nts_outputs/stability_windows.csv" \
  --out_json "/mnt/data/nts_outputs/stability_summary.json"
"""
import argparse, re, math, json, statistics
from collections import Counter
from pathlib import Path

import pandas as pd


def tokenize_path(path):
    """Read a text file and return its lowercase alphabetic tokens."""
    txt = Path(path).read_text(encoding="utf-8", errors="ignore")
    return re.findall(r"[a-z]+", txt.lower())


def compute_metrics(words):
    """Return unigram entropy (H), adjacent-word mutual information (MI),
    type/token ratio (P_s) and bigram type/token ratio (STS) for one window."""
    total = len(words)
    freq = Counter(words)
    pairs = list(zip(words, words[1:]))
    pair_counts = Counter(pairs)
    pair_total = max(1, len(pairs))

    # Unigram Shannon entropy in bits.
    probs = [c / total for c in freq.values()] if total else []
    H = -sum(p * math.log2(p) for p in probs) if probs else 0.0

    # Mutual information between adjacent words (p_xy-weighted PMI over bigrams).
    MI = 0.0
    for (w1, w2), c in pair_counts.items():
        p_xy = c / pair_total
        p_x = freq[w1] / total
        p_y = freq[w2] / total
        if p_xy > 0 and p_x > 0 and p_y > 0:
            MI += p_xy * math.log2(p_xy / (p_x * p_y))

    P_s = len(freq) / total if total else 0.0                     # distinct words / tokens
    STS = len(pair_counts) / pair_total if pair_total else 0.0    # distinct bigrams / bigrams
    return {"H": H, "MI": MI, "P_s": P_s, "STS": STS, "N": total, "V": len(freq)}


def normalize(series):
    """Min-max normalise a series to [0, 1]; a constant series maps to all zeros."""
    lo, hi = min(series), max(series)
    if hi - lo <= 1e-12:
        return [0.0 for _ in series]
    return [(x - lo) / (hi - lo) for x in series]


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--eva", required=True, help="EVA corpus (txt)")
    ap.add_argument("--window", type=int, default=50, help="Window size (tokens)")
    ap.add_argument("--windows", type=int, default=8, help="Number of windows")
    ap.add_argument("--shift", type=int, default=0, help="Shift between windows (0 = non-overlapping)")
    ap.add_argument("--out_csv", default="", help="Per-window CSV")
    ap.add_argument("--out_json", default="", help="JSON summary")
    args = ap.parse_args()

    tokens = tokenize_path(args.eva)
    W, K = args.window, args.windows
    S = args.shift if args.shift > 0 else W  # default shift = window size (non-overlapping)

    # Slice the token stream into K windows and compute the metrics per window.
    rows = []
    start = 0
    for i in range(K):
        end = start + W
        words = tokens[start:end]
        if not words:
            break
        m = compute_metrics(words)
        rows.append({"window": i + 1, **m})
        start += S

    if not rows:
        raise SystemExit("No windows analysed.")

    df = pd.DataFrame(rows)

    # Normalisation: put every series on a 0-1 scale before comparing spreads.
    Hn = normalize(df["H"].tolist())
    MIn = normalize(df["MI"].tolist())
    Psn = normalize(df["P_s"].tolist())
    STSn = normalize(df["STS"].tolist())

    # Stability indices: 1 minus the population standard deviation of each
    # normalised series; S5 is their plain average.
    S1 = 1 - statistics.pstdev(Psn)
    S2 = 1 - statistics.pstdev(STSn)
    S3 = 1 - statistics.pstdev(Hn)
    S4 = 1 - statistics.pstdev(MIn)
    S5 = (S1 + S2 + S3 + S4) / 4

    if args.out_csv:
        Path(args.out_csv).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(args.out_csv, index=False, encoding="utf-8")

    summary = {
        "params": {"window": W, "windows": len(rows), "shift": S, "eva_path": args.eva},
        "S": {"S1": S1, "S2": S2, "S3": S3, "S4": S4, "S5": S5},
        "means": {
            "H": float(df["H"].mean()),
            "MI": float(df["MI"].mean()),
            "P_s": float(df["P_s"].mean()),
            "STS": float(df["STS"].mean()),
        },
    }
    if args.out_json:
        Path(args.out_json).parent.mkdir(parents=True, exist_ok=True)
        Path(args.out_json).write_text(json.dumps(summary, indent=2), encoding="utf-8")

    # Log
    print("=== Stability S1..S5 (extended) ===")
    print(f"Windows analysed: {len(rows)} | W={W} | Shift={S}")
    for k, v in summary["S"].items():
        print(f"{k}: {v:.3f}")


if __name__ == "__main__":
    main()
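
# --- Illustrative sketch (not part of the CLI flow above; the toy token list is
# invented purely for demonstration). The helpers can also be used directly from
# a REPL or notebook, e.g.:
#
#     from nts_stability_s1s5 import compute_metrics
#     toy = ["daiin", "chedy", "daiin", "qokeedy", "chedy", "daiin"]
#     m = compute_metrics(toy)
#     m["N"], m["V"]   # (6, 3): six tokens, three distinct words
#     m["P_s"]         # 3 / 6 = 0.5 (type/token ratio of the window)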