[CLAUDE] Scaffold: khoi tao SOLUTION_ERP Phase 0
- .NET 10 Clean Architecture: Domain/Application/Infrastructure/Api (4 project)
- 2 React + Vite + TS app: fe-admin (:8082), fe-user (:8080) voi proxy /api
- Node engines >=20, .nvmrc = 20 cho CI (bai hoc NamGroup)
- SQL Server 2022 qua docker-compose (dev)
- Parse 8 FORM -> docs/forms-spec.md (catalog + ma HD format RG-001)
- Parse QUY_TRINH -> docs/workflow-contract.md (9 phase state machine + role matrix)
- docs: CLAUDE.md, STATUS.md, PROJECT-MAP.md, migration-todos.md (roadmap 5 phase)
- .claude/skills: 3 placeholder (contract-workflow, form-engine, permission-matrix)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
100
scripts/parse_forms.py
Normal file
100
scripts/parse_forms.py
Normal file
@ -0,0 +1,100 @@
|
||||
"""Extract text + structure from 8 FORM files (.docx, .doc, .xlsx).
|
||||
|
||||
Outputs: docs/forms-spec-raw.md — dump text + tables for manual field spec extraction.
|
||||
|
||||
Usage: python scripts/parse_forms.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
import docx
|
||||
from openpyxl import load_workbook
|
||||
except ImportError:
|
||||
sys.stderr.write("pip install python-docx openpyxl\n")
|
||||
sys.exit(1)
|
||||
|
||||
# Folder holding the source FORM files. Defaults to the original author's
# Dropbox location; set the SOLUTION_FORM_DIR environment variable to point
# the script at another folder (other machines, CI) without editing it.
# An unset or empty variable falls back to the default.
FORM_DIR = Path(
    os.environ.get("SOLUTION_FORM_DIR") or "D:/Dropbox/CONG_VIEC/SOLUTION/FORM"
)

# Dump target: docs/forms-spec-raw.md under the directory two levels above
# this script file.
OUT = Path(__file__).parent.parent / "docs" / "forms-spec-raw.md"

# WordprocessingML namespace, used to build qualified tag names when reading
# word/document.xml directly.
W_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
||||
|
||||
|
||||
def extract_doc_via_zip(path: Path) -> str:
    """Read paragraph text straight out of the zip package at *path*.

    Opens the file as a zip archive, parses ``word/document.xml`` and, for
    every paragraph element, concatenates the text of its text-run elements.
    Paragraphs that are blank after stripping are dropped; the rest are
    joined with newlines.

    This function never raises. Any failure (the file is missing, is not a
    zip archive, has no ``word/document.xml``, or holds malformed XML) is
    returned in-band as ``"[ERROR unzip: <message>]"``.
    """
    try:
        with zipfile.ZipFile(path) as archive, archive.open("word/document.xml") as xml_file:
            document = ET.parse(xml_file).getroot()

        # Qualified tag names for paragraphs and text runs.
        para_tag = f"{{{W_NS['w']}}}p"
        text_tag = f"{{{W_NS['w']}}}t"

        joined = (
            "".join(node.text or "" for node in para.iter(text_tag))
            for para in document.iter(para_tag)
        )
        return "\n".join(line for line in joined if line.strip())
    except Exception as err:
        return f"[ERROR unzip: {err}]"
|
||||
|
||||
|
||||
def extract_docx(path: Path) -> str:
    """Extract paragraph text, then table contents, from *path* via python-docx.

    The result lists every non-blank paragraph first, in the order
    python-docx yields them, followed by each table. A table starts with a
    ``--- TABLE n (r rows x c cols) ---`` header; each row becomes one line
    with cells separated by ``" || "`` and line breaks inside a cell shown
    as ``" | "``.

    NOTE: tables are appended after all paragraphs, so their original
    position between paragraphs is not preserved.

    If anything raises while reading the file, the result of
    ``extract_doc_via_zip(path)`` is returned instead.
    """
    try:
        document = docx.Document(str(path))

        chunks: list[str] = [
            para.text for para in document.paragraphs if para.text.strip()
        ]

        for number, table in enumerate(document.tables, start=1):
            chunks.append(
                f"\n--- TABLE {number} ({len(table.rows)} rows x {len(table.columns)} cols) ---"
            )
            chunks.extend(
                " || ".join(cell.text.strip().replace("\n", " | ") for cell in row.cells)
                for row in table.rows
            )

        return "\n".join(chunks)
    except Exception:
        # Best-effort fallback: read the raw XML from the package instead.
        return extract_doc_via_zip(path)
|
||||
|
||||
|
||||
def extract_xlsx(path: Path) -> str:
    """Render every sheet of the workbook at *path* as pipe-separated text.

    The workbook is loaded with ``data_only=True``. Each sheet starts with a
    ``--- SHEET: <name> (r rows x c cols) ---`` header. Each row becomes one
    line of stripped cell values joined by ``" | "``, with ``None`` cells
    shown as empty strings. Rows whose cells are all empty are omitted.

    Errors raised while loading or reading the workbook propagate to the
    caller.
    """
    workbook = load_workbook(str(path), data_only=True)
    chunks: list[str] = []

    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        chunks.append(
            f"\n--- SHEET: {sheet_name} ({sheet.max_row} rows x {sheet.max_column} cols) ---"
        )
        for cells in sheet.iter_rows(values_only=True):
            rendered = ["" if value is None else str(value).strip() for value in cells]
            if not any(rendered):
                continue  # skip rows with no visible content
            chunks.append(" | ".join(rendered))

    return "\n".join(chunks)
|
||||
|
||||
|
||||
def main() -> None:
    """Dump the text of every entry in ``FORM_DIR`` into ``OUT`` as Markdown.

    Entries are processed in sorted order. Each one gets a ``## <name>``
    heading followed by a fenced block holding at most 20000 characters of
    extracted text, plus a truncation note when more was available.
    ``.xlsx`` files go through ``extract_xlsx`` and ``.docx``/``.doc`` files
    through ``extract_docx``; anything else is recorded as ``[unsupported]``.

    An extractor that raises is reported inside that entry's block (and on
    stderr) instead of aborting the run, so one unreadable file cannot leave
    the dump half-written with an unclosed code fence.

    Raises:
        OSError: if ``FORM_DIR`` cannot be listed or ``OUT`` cannot be
            created or written.
    """
    max_chars = 20000
    extractors = {
        ".xlsx": extract_xlsx,
        ".docx": extract_docx,
        ".doc": extract_docx,
    }

    # List the inputs before opening OUT so that a missing FORM_DIR does not
    # truncate an existing dump.
    files = sorted(FORM_DIR.iterdir())
    OUT.parent.mkdir(parents=True, exist_ok=True)

    with OUT.open("w", encoding="utf-8") as out:
        out.write("# Forms — Raw Text Dump\n\n")
        out.write(f"Source: `{FORM_DIR}` — {len(files)} files\n\n")

        for f in files:
            out.write(f"\n---\n\n## {f.name}\n\n")

            extract = extractors.get(f.suffix.lower())
            if extract is None:
                text = "[unsupported]"
            else:
                try:
                    text = extract(f)
                except Exception as e:
                    # Top-level boundary: keep going with the remaining
                    # files, but make the failure visible in both places.
                    text = f"[ERROR {type(e).__name__}: {e}]"
                    sys.stderr.write(f"WARN {f.name}: {type(e).__name__}: {e}\n")

            out.write("```\n")
            out.write(text[:max_chars])
            if len(text) > max_chars:
                out.write(f"\n\n[... truncated {len(text) - max_chars} chars]")
            out.write("\n```\n")

    print(f"OK -> {OUT}")
|
||||
|
||||
|
||||
# Script entry point: build the dump only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user