"""Extract text + structure from 8 FORM files (.docx, .doc, .xlsx). Outputs: docs/forms-spec-raw.md — dump text + tables for manual field spec extraction. Usage: python scripts/parse_forms.py """ from __future__ import annotations import os import sys import zipfile from pathlib import Path from xml.etree import ElementTree as ET try: import docx from openpyxl import load_workbook except ImportError: sys.stderr.write("pip install python-docx openpyxl\n") sys.exit(1) FORM_DIR = Path("D:/Dropbox/CONG_VIEC/SOLUTION/FORM") OUT = Path(__file__).parent.parent / "docs" / "forms-spec-raw.md" W_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} def extract_doc_via_zip(path: Path) -> str: """Fallback for .doc (binary) or .docx — unzip and read word/document.xml.""" try: with zipfile.ZipFile(path) as z: with z.open("word/document.xml") as f: tree = ET.parse(f) root = tree.getroot() texts: list[str] = [] for para in root.iter(f"{{{W_NS['w']}}}p"): line = "".join(t.text or "" for t in para.iter(f"{{{W_NS['w']}}}t")) if line.strip(): texts.append(line) return "\n".join(texts) except Exception as e: return f"[ERROR unzip: {e}]" def extract_docx(path: Path) -> str: try: d = docx.Document(str(path)) parts: list[str] = [] for p in d.paragraphs: if p.text.strip(): parts.append(p.text) # Tables for i, tbl in enumerate(d.tables): parts.append(f"\n--- TABLE {i+1} ({len(tbl.rows)} rows x {len(tbl.columns)} cols) ---") for row in tbl.rows: cells = [c.text.strip().replace("\n", " | ") for c in row.cells] parts.append(" || ".join(cells)) return "\n".join(parts) except Exception: return extract_doc_via_zip(path) def extract_xlsx(path: Path) -> str: wb = load_workbook(str(path), data_only=True) parts: list[str] = [] for sh_name in wb.sheetnames: ws = wb[sh_name] parts.append(f"\n--- SHEET: {sh_name} ({ws.max_row} rows x {ws.max_column} cols) ---") for row in ws.iter_rows(values_only=True): vals = [str(v).strip() if v is not None else "" for v in row] if any(v for v in vals): parts.append(" | ".join(vals)) return "\n".join(parts) def main() -> None: files = sorted(FORM_DIR.iterdir()) OUT.parent.mkdir(parents=True, exist_ok=True) with OUT.open("w", encoding="utf-8") as out: out.write("# Forms — Raw Text Dump\n\n") out.write(f"Source: `{FORM_DIR}` — {len(files)} files\n\n") for f in files: out.write(f"\n---\n\n## {f.name}\n\n") ext = f.suffix.lower() if ext == ".xlsx": text = extract_xlsx(f) elif ext in (".docx", ".doc"): text = extract_docx(f) else: text = "[unsupported]" out.write("```\n") out.write(text[:20000]) if len(text) > 20000: out.write(f"\n\n[... truncated {len(text)-20000} chars]") out.write("\n```\n") print(f"OK -> {OUT}") if __name__ == "__main__": main()