Files
solution-erp/scripts/parse_forms.py
pqhuy1987 25dad7f36f [CLAUDE] Scaffold: khoi tao SOLUTION_ERP Phase 0
- .NET 10 Clean Architecture: Domain/Application/Infrastructure/Api (4 project)
- 2 React + Vite + TS app: fe-admin (:8082), fe-user (:8080) voi proxy /api
- Node engines >=20, .nvmrc = 20 cho CI (bai hoc NamGroup)
- SQL Server 2022 qua docker-compose (dev)
- Parse 8 FORM -> docs/forms-spec.md (catalog + ma HD format RG-001)
- Parse QUY_TRINH -> docs/workflow-contract.md (9 phase state machine + role matrix)
- docs: CLAUDE.md, STATUS.md, PROJECT-MAP.md, migration-todos.md (roadmap 5 phase)
- .claude/skills: 3 placeholder (contract-workflow, form-engine, permission-matrix)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 10:37:34 +07:00

101 lines
3.2 KiB
Python

"""Extract text + structure from 8 FORM files (.docx, .doc, .xlsx).
Outputs: docs/forms-spec-raw.md — dump text + tables for manual field spec extraction.
Usage: python scripts/parse_forms.py
"""
from __future__ import annotations
import os
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET
try:
import docx
from openpyxl import load_workbook
except ImportError:
sys.stderr.write("pip install python-docx openpyxl\n")
sys.exit(1)
# Folder that holds the source FORM files. The default is the location the
# script was originally written against; set the SOLUTION_ERP_FORM_DIR
# environment variable to run the script against a different folder.
FORM_DIR = Path(
    os.environ.get("SOLUTION_ERP_FORM_DIR") or "D:/Dropbox/CONG_VIEC/SOLUTION/FORM"
)
# Markdown dump target: the "docs" folder two levels above this file.
OUT = Path(__file__).parent.parent / "docs" / "forms-spec-raw.md"
# Namespace map for WordprocessingML element names inside word/document.xml.
W_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
def extract_doc_via_zip(path: Path) -> str:
    """Read paragraph text directly from ``word/document.xml`` inside a .docx.

    Used as a fallback when python-docx cannot open a file. The file is opened
    as a ZIP archive, so only ZIP-based Word packages can be read; a legacy
    binary ``.doc`` is not a ZIP archive and produces the error marker.

    Tabs and line breaks inside a run are emitted as ``"\\t"`` and ``"\\n"`` so
    that the words on either side of them are not fused together.

    Args:
        path: Location of the Word file.

    Returns:
        The non-blank paragraphs joined by newlines, or
        ``"[ERROR unzip: <reason>]"`` if the file cannot be opened or parsed.
    """
    try:
        ns = W_NS["w"]
        para_tag = f"{{{ns}}}p"
        run_tag = f"{{{ns}}}r"
        text_tag = f"{{{ns}}}t"
        # Run children that hold no text node but still separate words.
        separators = {
            f"{{{ns}}}tab": "\t",
            f"{{{ns}}}br": "\n",
            f"{{{ns}}}cr": "\n",
        }
        with zipfile.ZipFile(path) as z:
            with z.open("word/document.xml") as f:
                root = ET.parse(f).getroot()
        texts: list[str] = []
        for para in root.iter(para_tag):
            pieces: list[str] = []
            # Only direct children of runs are inspected: <w:tab> also occurs
            # as a tab-stop definition in paragraph properties, and that one
            # must not be turned into a tab character.
            for run in para.iter(run_tag):
                for child in run:
                    if child.tag == text_tag:
                        pieces.append(child.text or "")
                    else:
                        pieces.append(separators.get(child.tag, ""))
            line = "".join(pieces)
            if line.strip():
                texts.append(line)
        return "\n".join(texts)
    except Exception as e:
        # Deliberately broad: this is a best-effort dump, so any failure is
        # reported inline in the output instead of aborting the whole run.
        return f"[ERROR unzip: {e}]"
def extract_docx(path: Path) -> str:
    """Return the text of a Word document: paragraphs first, then tables.

    Non-blank paragraphs are listed in order. Each table follows with a
    header line giving its size and one line per row, cells separated by
    `` || `` and line breaks inside a cell shown as `` | ``. If anything
    fails while reading through python-docx, the raw-XML fallback
    ``extract_doc_via_zip`` is used instead.
    """
    try:
        document = docx.Document(str(path))
        chunks: list[str] = [
            para.text for para in document.paragraphs if para.text.strip()
        ]
        # Tables
        for number, table in enumerate(document.tables, start=1):
            chunks.append(
                f"\n--- TABLE {number} ({len(table.rows)} rows x {len(table.columns)} cols) ---"
            )
            chunks.extend(
                " || ".join(
                    cell.text.strip().replace("\n", " | ") for cell in row.cells
                )
                for row in table.rows
            )
        return "\n".join(chunks)
    except Exception:
        return extract_doc_via_zip(path)
def extract_xlsx(path: Path) -> str:
    """Dump every sheet of an .xlsx workbook as pipe-separated rows.

    The workbook is loaded with ``data_only=True``. Each sheet starts with a
    header line giving its name and size; rows whose cells are all empty are
    skipped, and ``None`` cells are written as empty strings.

    Args:
        path: Location of the workbook.

    Returns:
        The sheet dumps joined by newlines, or ``"[ERROR xlsx: <reason>]"`` if
        the workbook cannot be read. Failures are reported inline, like the
        Word extractors do, so one bad file does not abort the whole dump.
    """
    try:
        wb = load_workbook(str(path), data_only=True)
        parts: list[str] = []
        for sh_name in wb.sheetnames:
            ws = wb[sh_name]
            parts.append(
                f"\n--- SHEET: {sh_name} ({ws.max_row} rows x {ws.max_column} cols) ---"
            )
            for row in ws.iter_rows(values_only=True):
                vals = [str(v).strip() if v is not None else "" for v in row]
                if any(vals):
                    parts.append(" | ".join(vals))
        return "\n".join(parts)
    except Exception as e:
        # Deliberately broad: best-effort dump, mirror the "[ERROR unzip: ...]"
        # marker used for Word files.
        return f"[ERROR xlsx: {e}]"
def main() -> None:
    """Write a Markdown dump of every form in ``FORM_DIR`` to ``OUT``.

    Each regular file gets its own ``## <file name>`` section containing the
    extracted text inside a fenced block. ``.xlsx`` files go through
    ``extract_xlsx``, ``.docx``/``.doc`` through ``extract_docx``, and anything
    else is marked ``[unsupported]``. Sub-directories and Office lock files
    (names starting with ``~$``) are skipped so they are neither counted nor
    dumped. Text longer than the per-file cap is truncated with a note.

    Exits with status 1 and a message on stderr when ``FORM_DIR`` does not
    exist, before the output file is touched.
    """
    max_chars = 20000  # per-file cap on dumped text
    if not FORM_DIR.is_dir():
        sys.stderr.write(f"FORM_DIR not found: {FORM_DIR}\n")
        sys.exit(1)
    files = sorted(
        p
        for p in FORM_DIR.iterdir()
        if p.is_file() and not p.name.startswith("~$")
    )
    OUT.parent.mkdir(parents=True, exist_ok=True)
    with OUT.open("w", encoding="utf-8") as out:
        out.write("# Forms — Raw Text Dump\n\n")
        out.write(f"Source: `{FORM_DIR}` — {len(files)} files\n\n")
        for f in files:
            out.write(f"\n---\n\n## {f.name}\n\n")
            ext = f.suffix.lower()
            if ext == ".xlsx":
                text = extract_xlsx(f)
            elif ext in (".docx", ".doc"):
                text = extract_docx(f)
            else:
                text = "[unsupported]"
            out.write("```\n")
            out.write(text[:max_chars])
            if len(text) > max_chars:
                out.write(f"\n\n[... truncated {len(text) - max_chars} chars]")
            out.write("\n```\n")
    print(f"OK -> {OUT}")
# Run the dump only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()