#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
nts_eva_pie.py — EVA–PIE phonetic comparison (extended)
--------------------------------------------------------
- Normalized Levenshtein similarity (0–1) plus optional filters.
- Input: EVA corpus + PIE root dictionary (inline default or external JSON file).
- Output: CSV with all matches and JSON with summary statistics.

Example:
python nts_eva_pie.py \
  --eva "/mnt/data/voinich EVA.txt" \
  --min_sim 0.5 --top 200 \
  --out_csv "/mnt/data/nts_outputs/eva_pie_similarity.csv" \
  --out_json "/mnt/data/nts_outputs/eva_pie_summary.json"
"""
import argparse
import csv
import json
import re
from collections import Counter
from pathlib import Path

# Default PIE root dictionary: {root: meaning}.
DEFAULT_PIE = {
    "gen": "to generate, to give birth",
    "der": "to hold, to support",
    "lei": "to flow, to stream",
    "tel": "to carry, to complete",
    "al": "to grow, to nourish",
    "wer": "to turn, to speak",
    "ser": "to bind, to connect",
    "nei": "to breathe, to live",
    "kei": "to form, to create",
    "pel": "to move, to push",
}


def tokenize_path(path):
    """Read a text file and return its lowercase alphabetic tokens."""
    txt = Path(path).read_text(encoding="utf-8", errors="ignore")
    return re.findall(r"[a-z]+", txt.lower())


def load_pie(path_json):
    """Load a {root: meaning} dictionary from JSON, or fall back to DEFAULT_PIE."""
    if not path_json:
        return DEFAULT_PIE
    return json.loads(Path(path_json).read_text(encoding="utf-8"))


def similarity(a, b):
    """Normalized Levenshtein similarity in [0, 1]: 1 - distance / max length."""
    a = re.sub(r"[^a-z]", "", a.lower())
    b = re.sub(r"[^a-z]", "", b.lower())
    m, n = len(a), len(b)
    if m == 0 or n == 0:
        return 0.0
    # Standard dynamic-programming edit-distance table.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
    dist = dp[m][n]
    return max(0.0, 1 - dist / max(m, n))


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--eva", required=True, help="EVA corpus (txt)")
    ap.add_argument("--pie_json", default="", help="PIE dictionary as JSON {root: meaning}")
    ap.add_argument("--min_sim", type=float, default=0.5, help="Similarity threshold")
    ap.add_argument("--top", type=int, default=200, help="Maximum number of CSV rows")
    ap.add_argument("--out_csv", default="", help="Output CSV with matches")
    ap.add_argument("--out_json", default="", help="JSON summary")
    args = ap.parse_args()

    words = tokenize_path(args.eva)
    pie = load_pie(args.pie_json)

    # For each EVA word, keep the best-matching root at or above the threshold.
    results = []
    for w in words:
        best_root = None
        best_sim = 0.0
        for r in pie:
            sim = similarity(w, r)
            if sim >= args.min_sim and sim > best_sim:
                best_sim = sim
                best_root = r
        if best_root:
            results.append((w, best_root, pie[best_root], round(best_sim, 3)))

    # Sort by descending similarity (ties broken alphabetically) and cap the output.
    results.sort(key=lambda x: (-x[3], x[0]))
    top_res = results[:args.top]

    if args.out_csv:
        Path(args.out_csv).parent.mkdir(parents=True, exist_ok=True)
        with open(args.out_csv, "w", encoding="utf-8", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(["EVA_word", "PIE_root", "Meaning", "Similarity"])
            for row in top_res:
                wr.writerow(row)

    # Summary
    roots = Counter([r for _, r, _, _ in results])
    summary = {
        "params": {"eva_path": args.eva, "min_sim": args.min_sim, "roots": len(pie)},
        "matches": len(results),
        "top_roots": roots.most_common(20),
        "sample_preview": top_res,
    }
    if args.out_json:
        Path(args.out_json).parent.mkdir(parents=True, exist_ok=True)
        Path(args.out_json).write_text(
            json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    # Log
    print("=== EVA–PIE (extended) ===")
    print(f"EVA words scanned: {len(words)} | Matches: {len(results)} | Threshold: {args.min_sim}")
    print("Most frequent PIE roots:", ", ".join([f"{r}×{c}" for r, c in roots.most_common(10)]))


if __name__ == "__main__":
    main()
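# ---------------------------------------------------------------------------
# Worked example of the similarity metric (illustrative only; "ken" is a
# hypothetical token, not drawn from the EVA corpus):
#
#     >>> round(similarity("ken", "gen"), 3)
#     0.667
#
# One substitution out of max(3, 3) characters gives 1 - 1/3 ≈ 0.667, which
# clears the default --min_sim threshold of 0.5; identical strings score 1.0.
# ---------------------------------------------------------------------------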