- .NET 10 Clean Architecture: Domain/Application/Infrastructure/Api (4 project) - 2 React + Vite + TS app: fe-admin (:8082), fe-user (:8080) voi proxy /api - Node engines >=20, .nvmrc = 20 cho CI (bai hoc NamGroup) - SQL Server 2022 qua docker-compose (dev) - Parse 8 FORM -> docs/forms-spec.md (catalog + ma HD format RG-001) - Parse QUY_TRINH -> docs/workflow-contract.md (9 phase state machine + role matrix) - docs: CLAUDE.md, STATUS.md, PROJECT-MAP.md, migration-todos.md (roadmap 5 phase) - .claude/skills: 3 placeholder (contract-workflow, form-engine, permission-matrix) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
101 lines
3.2 KiB
Python
101 lines
3.2 KiB
Python
"""Extract text + structure from 8 FORM files (.docx, .doc, .xlsx).
|
|
|
|
Outputs: docs/forms-spec-raw.md — dump text + tables for manual field spec extraction.
|
|
|
|
Usage: python scripts/parse_forms.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
from xml.etree import ElementTree as ET
|
|
|
|
try:
|
|
import docx
|
|
from openpyxl import load_workbook
|
|
except ImportError:
|
|
sys.stderr.write("pip install python-docx openpyxl\n")
|
|
sys.exit(1)
|
|
|
|
# Directory whose entries main() iterates and dumps; hard-coded absolute path.
FORM_DIR = Path("D:/Dropbox/CONG_VIEC/SOLUTION/FORM")
|
|
# Markdown file written by main(): docs/forms-spec-raw.md under the parent of
# the directory that contains this script.
OUT = Path(__file__).parent.parent / "docs" / "forms-spec-raw.md"
|
|
|
|
# WordprocessingML namespace map; extract_doc_via_zip() uses the "w" entry to
# build the fully-qualified paragraph and text-run tag names.
W_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
|
|
|
|
|
def extract_doc_via_zip(path: Path) -> str:
    """Read paragraph text straight from ``word/document.xml`` inside *path*.

    This is the fallback used by ``extract_docx`` when python-docx fails. The
    file is opened as a zip archive; each ``w:p`` element contributes one
    output line built from its concatenated ``w:t`` text nodes, and
    paragraphs that are blank after stripping are dropped.

    Returns:
        The surviving paragraphs joined with newlines. Any failure (not a zip
        archive, missing document part, malformed XML, ...) is returned as an
        ``"[ERROR unzip: ...]"`` string instead of being raised.

    NOTE(review): a legacy binary ``.doc`` is presumably not a zip archive and
    would therefore always take the error branch -- confirm.
    """
    try:
        with zipfile.ZipFile(path) as archive, archive.open("word/document.xml") as xml_part:
            body = ET.parse(xml_part).getroot()

        namespace = W_NS["w"]
        paragraph_tag = f"{{{namespace}}}p"
        text_tag = f"{{{namespace}}}t"

        # One candidate line per paragraph: all of its text nodes glued together.
        candidates = (
            "".join(node.text or "" for node in paragraph.iter(text_tag))
            for paragraph in body.iter(paragraph_tag)
        )
        return "\n".join(candidate for candidate in candidates if candidate.strip())
    except Exception as err:
        return f"[ERROR unzip: {err}]"
|
|
|
|
|
|
def extract_docx(path: Path) -> str:
    """Extract paragraph and table text from a Word file using python-docx.

    Non-blank paragraphs are emitted first, one per line. Every table then
    follows (so the output is not in document order): a ``--- TABLE n ---``
    banner with its row/column counts, then one line per row with cells
    separated by `` || `` and in-cell newlines replaced by `` | ``.

    If anything in the python-docx path raises, the result of
    ``extract_doc_via_zip(path)`` is returned instead.
    """
    try:
        document = docx.Document(str(path))

        lines: list[str] = [
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        ]

        for number, table in enumerate(document.tables, start=1):
            lines.append(
                f"\n--- TABLE {number} ({len(table.rows)} rows x {len(table.columns)} cols) ---"
            )
            lines.extend(
                " || ".join(cell.text.strip().replace("\n", " | ") for cell in row.cells)
                for row in table.rows
            )

        return "\n".join(lines)
    except Exception:
        # python-docx could not handle the file; try reading the raw XML part.
        return extract_doc_via_zip(path)
|
|
|
|
|
|
def extract_xlsx(path: Path) -> str:
    """Dump every worksheet of the workbook at *path* as pipe-separated text.

    The workbook is loaded with ``data_only=True``. Each sheet starts with a
    ``--- SHEET: ... ---`` banner carrying its name and the row/column counts
    reported by the worksheet, followed by one line per row that contains at
    least one non-empty cell. ``None`` cells become empty strings; every other
    value goes through ``str()`` and is stripped.

    Errors raised while loading or reading the workbook are not caught here.
    """
    workbook = load_workbook(str(path), data_only=True)
    lines: list[str] = []

    for title in workbook.sheetnames:
        sheet = workbook[title]
        lines.append(
            f"\n--- SHEET: {title} ({sheet.max_row} rows x {sheet.max_column} cols) ---"
        )
        for record in sheet.iter_rows(values_only=True):
            rendered = ["" if value is None else str(value).strip() for value in record]
            # Rows whose cells are all empty are left out of the dump.
            if any(rendered):
                lines.append(" | ".join(rendered))

    return "\n".join(lines)
|
|
|
|
|
|
def main() -> None:
    """Dump the text of every entry in ``FORM_DIR`` into the markdown file ``OUT``.

    Entries are processed in sorted order. Each one gets a ``## <file name>``
    heading followed by a fenced block holding the extracted text:
    ``.xlsx`` goes through ``extract_xlsx``, ``.docx``/``.doc`` through
    ``extract_docx``, and anything else is recorded as ``[unsupported]``.
    Text longer than the per-file cap is cut and a truncation note is added.

    A failure while extracting one file is written into the dump as an
    ``[ERROR extract: ...]`` entry (and echoed to stderr) instead of aborting
    the run, so the remaining files are still processed and the output file is
    never left with an unclosed code fence.

    Raises:
        OSError: if ``FORM_DIR`` cannot be listed or ``OUT`` cannot be written.
    """
    max_chars = 20000  # per-file cap on the amount of text copied into the dump

    files = sorted(FORM_DIR.iterdir())
    OUT.parent.mkdir(parents=True, exist_ok=True)

    with OUT.open("w", encoding="utf-8") as out:
        out.write("# Forms — Raw Text Dump\n\n")
        out.write(f"Source: `{FORM_DIR}` — {len(files)} files\n\n")

        for f in files:
            out.write(f"\n---\n\n## {f.name}\n\n")
            ext = f.suffix.lower()

            try:
                if ext == ".xlsx":
                    text = extract_xlsx(f)
                elif ext in (".docx", ".doc"):
                    text = extract_docx(f)
                else:
                    text = "[unsupported]"
            except Exception as exc:
                # extract_xlsx does no error handling of its own; without this
                # guard a single unreadable workbook would stop the whole dump
                # half-written. Report it and carry on with the next file.
                sys.stderr.write(f"{f.name}: {exc}\n")
                text = f"[ERROR extract: {exc}]"

            out.write("```\n")
            out.write(text[:max_chars])
            if len(text) > max_chars:
                out.write(f"\n\n[... truncated {len(text) - max_chars} chars]")
            out.write("\n```\n")

    print(f"OK -> {OUT}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the dump only when executed as a script, not when imported.
    main()
|