#!/usr/bin/env python3
"""Validate a generated tutorial package before delivery."""

from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import zipfile
from dataclasses import dataclass
from pathlib import Path


@dataclass
class Check:
    """A single validation result as printed in the final report."""

    status: str  # one of "PASS", "WARN", "FAIL"
    message: str


class Validator:
    """Run delivery checks against one package directory and collect results.

    Results accumulate in ``self.checks``; ``run()`` prints them and returns
    the process exit code.
    """

    def __init__(self, package_dir: Path, basename: str, formats: list[str], check_deps: bool, strict: bool) -> None:
        self.package_dir = package_dir
        self.basename = basename
        self.formats = formats
        self.check_deps = check_deps
        self.strict = strict
        self.checks: list[Check] = []
        # Filled by validate_markdown() and consumed by later checks.
        self.chapter_numbers: list[int] = []
        self.chapter_sections: list[tuple[int, str, str]] = []
        self.markdown_image_refs: list[str] = []

    def add(self, status: str, message: str) -> None:
        """Record one check result."""
        self.checks.append(Check(status, message))

    def pass_(self, message: str) -> None:
        """Record a PASS result."""
        self.add("PASS", message)

    def warn(self, message: str) -> None:
        """Record a WARN result."""
        self.add("WARN", message)

    def fail(self, message: str) -> None:
        """Record a FAIL result."""
        self.add("FAIL", message)

    def run(self) -> int:
        """Run every validation step and return the exit code from finish()."""
        if not self.package_dir.exists():
            self.fail(f"Package directory not found: {self.package_dir}")
            return self.finish()
        if self.check_deps:
            self.validate_dependencies()
        self.validate_package_files()
        # validate_markdown() must precede validate_visuals(): the latter
        # reads chapter_numbers and markdown_image_refs.
        self.validate_markdown()
        self.validate_visuals()
        self.validate_html()
        self.validate_docx()
        self.validate_pdf()
        self.validate_no_absolute_paths()
        self.validate_public_provenance_hygiene()
        return self.finish()

    def finish(self) -> int:
        """Print all results plus a summary.

        Returns:
            1 if any check failed, 2 if ``strict`` is set and there are
            warnings, otherwise 0.
        """
        failures = [check for check in self.checks if check.status == "FAIL"]
        warnings = [check for check in self.checks if check.status == "WARN"]
        for check in self.checks:
            print(f"[{check.status}] {check.message}")
        print(f"\nSummary: {len(failures)} failures, {len(warnings)} warnings, {len(self.checks)} checks")
        if failures:
            return 1
        if self.strict and warnings:
            return 2
        return 0

    def validate_dependencies(self) -> None:
        """Report which export tools and optional Python packages are present."""
        if shutil.which("pandoc"):
            self.pass_("pandoc is available")
        else:
            self.fail("pandoc is missing; DOCX and HTML export will fail")

        browser = find_browser()
        if browser:
            self.pass_(f"Chromium-family browser is available: {browser}")
        else:
            self.warn("No Chrome/Edge/Brave/Chromium browser found; screenshots and PDF fallback may fail")

        if can_import("PIL"):
            self.pass_("Pillow is available for exact screenshot cropping")
        else:
            self.warn("Pillow is missing; screenshot cropping cannot be validated or performed exactly")

        if can_import("weasyprint"):
            self.pass_("WeasyPrint is available for clean PDF export")
        else:
            self.warn("WeasyPrint is missing; PDF export will rely on browser fallback")

        if can_import("docx"):
            self.pass_("python-docx is available for default Word reference generation")
        else:
            self.warn("python-docx is missing; default Word reference style generation may be skipped")

    def validate_package_files(self) -> None:
        """Check that the required package files and requested exports exist."""
        required = [
            "brief.json",
            "tutorial.md",
            "outline.md",
            "research/evidence-map.md",
            "research/chapter-quality-review.md",
            "visuals/visual-spec.json",
            "visuals/index.html",
        ]
        for relative in required:
            path = self.package_dir / relative
            if path.exists():
                self.pass_(f"Found {relative}")
            else:
                self.fail(f"Missing {relative}")

        source_register = self.package_dir / "research/source-register.md"
        user_register = self.package_dir / "research/user-materials-register.md"
        if source_register.exists() or user_register.exists():
            self.pass_("Found at least one research register")
        else:
            self.warn("No source register or user material register found")

        exports_dir = self.package_dir / "exports"
        if "html" in self.formats:
            self.require_file(exports_dir / f"{self.basename}.html", "HTML export")
        if "docx" in self.formats:
            self.require_file(exports_dir / f"{self.basename}.docx", "DOCX export")
        if "pdf" in self.formats:
            self.require_file(exports_dir / f"{self.basename}.pdf", "PDF export")

    def require_file(self, path: Path, label: str) -> None:
        """Fail unless ``path`` exists and is non-empty."""
        if path.exists() and path.stat().st_size > 0:
            self.pass_(f"{label} exists")
        elif path.exists():
            self.fail(f"{label} exists but is empty: {path.relative_to(self.package_dir)}")
        else:
            self.fail(f"{label} missing: {path.relative_to(self.package_dir)}")

    def validate_markdown(self) -> None:
        """Check tutorial.md headings, images and source markers.

        Also populates ``chapter_numbers``, ``chapter_sections`` and
        ``markdown_image_refs`` for later checks.
        """
        tutorial = self.package_dir / "tutorial.md"
        if not tutorial.exists():
            return
        text = read_text(tutorial)
        # Detect headings on a copy whose fenced code is blanked out, so that
        # "# comment" lines inside code samples are not counted as headings.
        # Masking preserves offsets, so matches can still slice ``text``.
        prose = mask_fenced_code(text)

        h1_count = len(re.findall(r"^#\s+", prose, flags=re.M))
        if h1_count == 1:
            self.pass_("tutorial.md has one H1 title")
        else:
            self.warn(f"tutorial.md should have one H1 title; found {h1_count}")

        chapter_matches = list(re.finditer(r"^##\s+第\s*(\d+)\s*章\b", prose, flags=re.M))
        self.chapter_numbers = [int(match.group(1)) for match in chapter_matches]
        self.chapter_sections = split_chapter_sections(text, chapter_matches)
        if chapter_matches:
            self.pass_(f"Found {len(chapter_matches)} numbered chapters")
        else:
            self.fail("No numbered chapter headings found; expected H2 headings like '## 第1章 标题'")

        h3_headings = re.findall(r"^###\s+(.+)$", prose, flags=re.M)
        non_decimal = [heading for heading in h3_headings if not re.match(r"\d+\.\d+\s+", heading)]
        if h3_headings and not non_decimal:
            self.pass_("All H3 headings use decimal numbering")
        elif non_decimal:
            self.warn(f"{len(non_decimal)} H3 headings do not use decimal numbering")

        images = markdown_images(text)
        self.markdown_image_refs = images
        if images:
            # An empty target (e.g. "![x]( )") can never resolve, so it is
            # counted as missing instead of resolving to the package root.
            missing = [
                image
                for image in images
                if is_local_image(image) and (not image or not (self.package_dir / image).exists())
            ]
            if missing:
                self.fail(f"Missing {len(missing)} local markdown images")
            else:
                self.pass_(f"All {len(images)} markdown image references resolve or are remote")
        else:
            self.warn("No markdown images found; each chapter should normally include a visual")

        source_ids = public_source_markers(text)
        if source_ids:
            self.fail(
                "tutorial.md exposes internal source markers: "
                + ", ".join(sorted(set(source_ids))[:8])
            )
        else:
            self.pass_("tutorial.md contains no public bracket source markers")

        self.validate_specific_h3_headings(h3_headings)
        self.validate_chapter_depth_consistency()
        self.validate_chapter_quality_review()

    def validate_visuals(self) -> None:
        """Check visual-spec.json and the SVG/PNG file for each spec entry."""
        spec_path = self.package_dir / "visuals/visual-spec.json"
        if not spec_path.exists():
            return
        try:
            spec = json.loads(spec_path.read_text(encoding="utf-8"))
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError; either means the file cannot be used as JSON.
            self.fail(f"visual-spec.json is invalid JSON: {exc}")
            return

        # A top-level list/string/number has no .get(); report it instead of
        # crashing with AttributeError.
        if not isinstance(spec, dict):
            self.fail("visual-spec.json must contain a non-empty chapters array")
            return
        chapters = spec.get("chapters") or spec.get("modules") or []
        if not isinstance(chapters, list) or not chapters:
            self.fail("visual-spec.json must contain a non-empty chapters array")
            return
        self.pass_(f"visual-spec.json contains {len(chapters)} chapter visual specs")

        visual_ids = {
            str(chapter.get("id") or f"chapter-{index:02d}")
            for index, chapter in enumerate(chapters, start=1)
            if isinstance(chapter, dict)
        }
        self.validate_chapter_visual_coverage(chapters, visual_ids)

        for index, chapter in enumerate(chapters, start=1):
            if not isinstance(chapter, dict):
                self.fail(f"Visual spec item {index} is not an object")
                continue
            visual_id = str(chapter.get("id") or f"chapter-{index:02d}")
            if not chapter.get("title"):
                self.warn(f"{visual_id} has no title")
            if not chapter.get("caption"):
                self.warn(f"{visual_id} has no caption")

            svg = self.package_dir / "visuals" / f"{visual_id}.svg"
            png = self.package_dir / "assets/screenshots" / f"{visual_id}.png"
            if svg.exists():
                self.pass_(f"SVG exists for {visual_id}")
            else:
                self.fail(f"Missing SVG for {visual_id}")
            if png.exists():
                self.pass_(f"PNG screenshot exists for {visual_id}")
                self.validate_png_size(png, visual_id)
            else:
                self.warn(f"Missing PNG screenshot for {visual_id}; SVG fallback may be used")

    def validate_chapter_visual_coverage(self, chapters: list[object], visual_ids: set[str]) -> None:
        """Compare visual spec IDs and embedded images with numbered chapters.

        Does nothing when no numbered chapters were found in tutorial.md.
        """
        if not self.chapter_numbers:
            return
        expected_ids = [chapter_visual_id(number) for number in self.chapter_numbers]
        missing_specs = [visual_id for visual_id in expected_ids if visual_id not in visual_ids]
        extra_specs = sorted(visual_ids - set(expected_ids))

        if len(chapters) == len(self.chapter_numbers):
            self.pass_("Visual spec count matches numbered chapter count")
        else:
            self.fail(
                f"Visual spec count does not match numbered chapters: "
                f"{len(chapters)} specs for {len(self.chapter_numbers)} chapters"
            )

        if missing_specs:
            self.fail(f"Missing visual specs for chapters: {', '.join(missing_specs)}")
        else:
            self.pass_("Every numbered chapter has a matching visual spec ID")

        if extra_specs:
            self.warn(f"Visual spec has extra IDs not matched to numbered chapters: {', '.join(extra_specs)}")

        embedded_ids = local_image_stems(self.markdown_image_refs)
        missing_embeds = [visual_id for visual_id in expected_ids if visual_id not in embedded_ids]
        if missing_embeds:
            self.fail(f"Tutorial markdown is missing embedded visuals for: {', '.join(missing_embeds)}")
        else:
            self.pass_("Every numbered chapter has an embedded visual reference")

    def validate_png_size(self, png: Path, visual_id: str) -> None:
        """Warn when a PNG is narrower than 1600px or not roughly 16:9.

        Silently skipped when Pillow is not installed.
        """
        try:
            from PIL import Image
        except ImportError:
            return
        try:
            with Image.open(png) as image:
                width, height = image.size
        except Exception as exc:
            self.fail(f"Cannot open PNG for {visual_id}: {exc}")
            return
        ratio = width / height if height else 0
        if width >= 1600 and 1.65 <= ratio <= 1.9:
            self.pass_(f"PNG for {visual_id} is high-resolution 16:9-ish ({width}x{height})")
        else:
            self.warn(f"PNG for {visual_id} may be low resolution or wrong aspect ratio ({width}x{height})")

    def validate_html(self) -> None:
        """Check the HTML export for the expected layout markers."""
        if "html" not in self.formats:
            return
        html_path = self.package_dir / "exports" / f"{self.basename}.html"
        if not html_path.exists():
            return
        html = read_text(html_path)

        if 'class="report-shell"' in html and 'class="article-body"' in html:
            self.pass_("HTML has centered report-shell and article-body layout")
        else:
            self.fail("HTML missing report-shell/article-body layout wrapper")

        if 'id="TOC"' in html:
            self.pass_("HTML has a generated TOC")
        else:
            self.warn("HTML has no nav#TOC; long tutorials should include a sticky anchor menu")

        if 'class="doc-date"' in html:
            self.pass_("HTML has document date below title")
        else:
            self.warn("HTML has no document date line")

        # NOTE(review): in the source this file was recovered from, the text
        # between this H1 count and the validate_docx signature was lost
        # (apparently stripped as if it were an HTML tag). The check below is
        # a minimal reconstruction mirroring the tutorial.md H1 check; the
        # pattern, severity and wording must be confirmed against upstream.
        h1_count = len(re.findall(r"<h1\b", html, flags=re.I))
        if h1_count == 1:
            self.pass_("HTML has one H1 title")
        else:
            self.warn(f"HTML should have one H1 title; found {h1_count}")

    def validate_docx(self) -> None:
        """Check the DOCX export for header/footer parts and local paths."""
        if "docx" not in self.formats:
            return
        docx_path = self.package_dir / "exports" / f"{self.basename}.docx"
        if not docx_path.exists():
            return
        try:
            with zipfile.ZipFile(docx_path, "r") as archive:
                names = archive.namelist()
                header_footer_files = [
                    name
                    for name in names
                    if name.startswith("word/header") or name.startswith("word/footer")
                ]
                document_xml = archive.read("word/document.xml").decode("utf-8", errors="ignore")
        except Exception as exc:
            self.fail(f"Cannot inspect DOCX: {exc}")
            return

        if header_footer_files:
            self.fail(f"DOCX still contains header/footer files: {', '.join(header_footer_files[:4])}")
        elif "headerReference" in document_xml or "footerReference" in document_xml:
            self.fail("DOCX document.xml still references a header or footer")
        else:
            self.pass_("DOCX has no header/footer parts or references")

        if "/Users/" in document_xml or "file:///" in document_xml:
            self.fail("DOCX document XML contains a local absolute path")
        else:
            self.pass_("DOCX document XML contains no local absolute paths")

    def validate_pdf(self) -> None:
        """Warn when the PDF export is 10,000 bytes or smaller."""
        if "pdf" not in self.formats:
            return
        pdf_path = self.package_dir / "exports" / f"{self.basename}.pdf"
        if not pdf_path.exists():
            return
        size = pdf_path.stat().st_size
        if size > 10_000:
            self.pass_(f"PDF exists and is non-trivial ({size} bytes)")
        else:
            self.warn(f"PDF exists but is very small ({size} bytes)")

    def validate_no_absolute_paths(self) -> None:
        """Fail when tutorial.md or the HTML export contains a local path."""
        text_files = [
            self.package_dir / "tutorial.md",
            self.package_dir / "exports" / f"{self.basename}.html",
        ]
        leaks: list[str] = []
        for path in text_files:
            if not path.exists():
                continue
            text = read_text(path)
            if "file:///" in text or "/Users/" in text:
                leaks.append(str(path.relative_to(self.package_dir)))
        if leaks:
            self.fail(f"Local absolute paths found in final text outputs: {', '.join(leaks)}")
        else:
            self.pass_("No local absolute paths found in final text outputs")

    def validate_specific_h3_headings(self, h3_headings: list[str]) -> None:
        """Fail when the same generic label is used for more than one H3."""
        if not h3_headings:
            return
        generic_labels = {
            "你要做的事",
            "你要注意什么",
            "示例",
            "检查点",
            "小结",
            "练习",
            "what to notice",
            "example",
            "checkpoint",
            "summary",
        }
        repeated: dict[str, int] = {}
        for heading in h3_headings:
            label = re.sub(r"^\d+\.\d+\s+", "", heading).strip()
            # Keep only the text before the first colon or space. The
            # recovered source had a duplicated ASCII colon in this class,
            # most likely a lost full-width colon; accepting "：" is a
            # superset of the previous behaviour.
            label = re.sub(r"[：: ].*$", "", label).strip()
            if label.lower() in generic_labels:
                repeated[label] = repeated.get(label, 0) + 1
        offenders = [f"{label} x{count}" for label, count in repeated.items() if count > 1]
        if offenders:
            self.fail(
                "H3 headings repeat generic labels instead of outline-specific section titles: "
                + ", ".join(offenders)
            )
        else:
            self.pass_("H3 headings are specific, not repeated generic labels")

    def validate_chapter_depth_consistency(self) -> None:
        """Warn when later or individual chapters are much shorter than early ones.

        Skipped when fewer than four chapters were found.
        """
        if len(self.chapter_sections) < 4:
            return
        chapter_lengths = [
            (number, visible_text_length(section))
            for number, _heading, section in self.chapter_sections
        ]
        first_half = chapter_lengths[: max(1, len(chapter_lengths) // 2)]
        second_half = chapter_lengths[len(chapter_lengths) // 2 :]
        first_median = median([length for _number, length in first_half])
        second_median = median([length for _number, length in second_half])
        thin_chapters = [
            f"第{number}章 ({length})"
            for number, length in chapter_lengths
            if first_median and length < first_median * 0.45
        ]
        if second_median and first_median and second_median < first_median * 0.65:
            self.warn(
                "Later chapters appear much thinner than early chapters; "
                f"first-half median {first_median}, second-half median {second_median}"
            )
        elif thin_chapters:
            self.warn("Some chapters may be too thin for a full tutorial: " + ", ".join(thin_chapters[:6]))
        else:
            self.pass_("Chapter depth is reasonably balanced across the tutorial")

    def validate_chapter_quality_review(self) -> None:
        """Check that the quality review mentions every chapter and column."""
        review = self.package_dir / "research/chapter-quality-review.md"
        if not review.exists():
            self.fail("Missing research/chapter-quality-review.md for per-chapter quality control")
            return
        text = read_text(review)
        missing = [
            number
            for number in self.chapter_numbers
            if not re.search(rf"(第\s*{number}\s*章|chapter-0*{number}\b)", text, flags=re.I)
        ]
        if missing:
            self.fail(
                "chapter-quality-review.md is missing review rows for chapters: "
                + ", ".join(f"第{number}章" for number in missing)
            )
        else:
            self.pass_("chapter-quality-review.md covers every numbered chapter")

        required_terms = [
            "learner_question",
            "depth_status",
            "example_or_case",
            "practice_or_checkpoint",
            "visual_fit",
        ]
        missing_terms = [term for term in required_terms if term not in text]
        if missing_terms:
            self.warn("chapter-quality-review.md may be missing expected columns: " + ", ".join(missing_terms))
        else:
            self.pass_("chapter-quality-review.md includes depth, example, practice, and visual review columns")

    def validate_public_provenance_hygiene(self) -> None:
        """Fail when any public output contains source markers or provenance wording.

        The PDF is only scanned when the ``pdftotext`` binary is on PATH and
        produces text.
        """
        public_texts: list[tuple[str, str]] = []
        tutorial = self.package_dir / "tutorial.md"
        html_path = self.package_dir / "exports" / f"{self.basename}.html"
        docx_path = self.package_dir / "exports" / f"{self.basename}.docx"
        pdf_path = self.package_dir / "exports" / f"{self.basename}.pdf"

        if tutorial.exists():
            public_texts.append(("tutorial.md", read_text(tutorial)))
        if html_path.exists():
            public_texts.append((f"exports/{self.basename}.html", read_text(html_path)))
        if docx_path.exists():
            public_texts.append((f"exports/{self.basename}.docx", docx_visible_text(docx_path)))
        if pdf_path.exists() and shutil.which("pdftotext"):
            text = extract_pdf_text(pdf_path)
            if text:
                public_texts.append((f"exports/{self.basename}.pdf", text))

        marker_leaks = []
        provenance_leaks = []
        for label, text in public_texts:
            markers = sorted(set(public_source_markers(text)))
            if markers:
                marker_leaks.append(f"{label}: {', '.join(markers[:8])}")
            phrases = public_provenance_phrases(text)
            if phrases:
                provenance_leaks.append(f"{label}: {', '.join(phrases[:6])}")

        if marker_leaks:
            self.fail("Public outputs expose internal source markers: " + "; ".join(marker_leaks))
        else:
            self.pass_("Public outputs contain no internal bracket source markers")

        if provenance_leaks:
            self.fail("Public outputs expose internal provenance wording: " + "; ".join(provenance_leaks))
        else:
            self.pass_("Public outputs do not describe themselves as based on supplied material")


def can_import(module: str) -> bool:
    """Return True when ``module`` can actually be imported.

    The import is executed (not just located) and any exception counts as
    unavailable.
    """
    try:
        __import__(module)
    except Exception:
        return False
    return True


def find_browser() -> str:
    """Return the path of the first Chromium-family browser found, or ""."""
    candidates = [
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
        "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
    ]
    for candidate in candidates:
        if os.access(candidate, os.X_OK):
            return candidate
    # The last three names are common Linux package binaries; they are tried
    # after the original names so existing lookups keep their precedence.
    names = [
        "google-chrome",
        "chromium",
        "chromium-browser",
        "msedge",
        "brave",
        "google-chrome-stable",
        "microsoft-edge",
        "brave-browser",
    ]
    for name in names:
        path = shutil.which(name)
        if path:
            return path
    return ""


def read_text(path: Path) -> str:
    """Read ``path`` as UTF-8, dropping undecodable bytes."""
    return path.read_text(encoding="utf-8", errors="ignore")


_FENCED_CODE_RE = re.compile(
    r"^[ \t]{0,3}(`{3,}|~{3,})[^\n]*\n.*?^[ \t]{0,3}\1[`~]*[ \t]*$",
    flags=re.M | re.S,
)


def mask_fenced_code(markdown: str) -> str:
    """Blank out closed fenced code blocks while preserving offsets.

    Every non-newline character of a block (fences included) becomes a
    space, so match positions found in the result are valid indexes into
    the original string. Unclosed fences are left untouched.
    """
    return _FENCED_CODE_RE.sub(lambda match: re.sub(r"[^\n]", " ", match.group(0)), markdown)


def split_chapter_sections(markdown: str, chapter_matches: list[re.Match[str]]) -> list[tuple[int, str, str]]:
    """Slice ``markdown`` into (number, heading, section text) per chapter.

    Each section runs from its heading match to the start of the next
    match, or to the end of the document for the last chapter.
    """
    sections: list[tuple[int, str, str]] = []
    for index, match in enumerate(chapter_matches):
        start = match.start()
        end = chapter_matches[index + 1].start() if index + 1 < len(chapter_matches) else len(markdown)
        number = int(match.group(1))
        heading = match.group(0)
        sections.append((number, heading, markdown[start:end]))
    return sections


def visible_text_length(markdown: str) -> int:
    """Return a character count after removing code, links and markup.

    All whitespace is removed before counting.
    """
    text = re.sub(r"```.*?```", " ", markdown, flags=re.S)
    text = re.sub(r"!\[[^\]]*\]\([^)]+\)", " ", text)
    text = re.sub(r"\[[^\]]+\]\([^)]+\)", " ", text)
    text = re.sub(r"^#+\s+", " ", text, flags=re.M)
    text = re.sub(r"[|`*_>#\-\[\]():]", " ", text)
    text = re.sub(r"\s+", "", text)
    return len(text)


def median(values: list[int]) -> int:
    """Return the integer median of ``values`` (0 for an empty list).

    For an even count the two middle values are averaged with floor division.
    """
    if not values:
        return 0
    ordered = sorted(values)
    mid = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) // 2


def public_source_markers(text: str) -> list[str]:
    """Return every bracket marker such as ``[U12]`` found in ``text``."""
    return re.findall(r"\[(?:U|P|G|A|X|L)\d+\]", text)


def public_provenance_phrases(text: str) -> list[str]:
    """Return the first match of each provenance pattern found in ``text``."""
    patterns = [
        r"用户粘贴",
        r"用户提供",
        r"用户给(?:到|的)",
        r"基于[^。\n]{0,30}(?:文章|原文|资料|材料)",
        r"根据[^。\n]{0,30}(?:文章|原文|资料|材料)(?:整理|生成|改写)",
        r"参考原文",
        r"原文(?:中|里|作者)",
        r"X Article",
        r"user-supplied",
        r"pasted (?:article|note|material)",
        r"based on (?:the )?(?:article|source|supplied material|original text)",
    ]
    found: list[str] = []
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.I)
        if match:
            found.append(match.group(0))
    return found


def docx_visible_text(path: Path) -> str:
    """Return the raw XML of the document, header and footer parts.

    Best effort: any error while reading the archive yields "".
    """
    try:
        with zipfile.ZipFile(path, "r") as archive:
            chunks = []
            for name in archive.namelist():
                if name == "word/document.xml" or name.startswith("word/header") or name.startswith("word/footer"):
                    chunks.append(archive.read(name).decode("utf-8", errors="ignore"))
            return "\n".join(chunks)
    except Exception:
        return ""


def extract_pdf_text(path: Path) -> str:
    """Return the text ``pdftotext`` extracts from ``path``, or "" on failure."""
    try:
        completed = subprocess.run(
            ["pdftotext", str(path), "-"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            # Decode explicitly as UTF-8 rather than with the locale
            # encoding, so a non-UTF-8 locale cannot raise a decode error
            # that would silently drop the PDF from the scan.
            encoding="utf-8",
            errors="ignore",
            timeout=30,
        )
    except Exception:
        return ""
    return completed.stdout if completed.returncode == 0 else ""


def markdown_images(markdown: str) -> list[str]:
    """Return the normalized target of every ``![alt](target)`` image."""
    return [
        normalize_markdown_target(match.group(2))
        for match in re.finditer(r"!\[([^\]]*)\]\(([^)]+)\)", markdown)
    ]


def normalize_markdown_target(target: str) -> str:
    """Strip an optional title and angle brackets from a link target.

    Returns "" for a whitespace-only target instead of raising IndexError.
    """
    target = target.strip()
    if target.startswith("<"):
        end = target.find(">")
        if end != -1:
            return target[1:end].strip()
    parts = target.split()
    if not parts:
        return ""
    return parts[0].strip("\"'")


def is_local_image(path: str) -> bool:
    """Return True unless ``path`` starts with a URL scheme such as ``https:``."""
    return not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:", path)


def local_image_stems(paths: list[str]) -> set[str]:
    """Return the file stems of local image paths, ignoring ``#`` and ``?`` suffixes."""
    stems: set[str] = set()
    for path in paths:
        if not is_local_image(path):
            continue
        clean_path = path.split("#", 1)[0].split("?", 1)[0]
        stems.add(Path(clean_path).stem)
    return stems


def chapter_visual_id(number: int) -> str:
    """Return the visual ID for a chapter number, e.g. ``chapter-03``."""
    return f"chapter-{number:02d}"


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser(description="Validate a generated yao-tutorial-skill output package.")
    parser.add_argument("package_dir", help="Generated tutorial package directory, usually outputs/yao-tutorials/.")
    parser.add_argument("--basename", default="tutorial", help="Base export filename. Defaults to tutorial.")
    parser.add_argument(
        "--formats",
        nargs="+",
        choices=["docx", "html", "pdf"],
        default=["docx", "html", "pdf"],
        help="Expected export formats.",
    )
    parser.add_argument("--check-deps", action="store_true", help="Also check local export dependencies.")
    parser.add_argument("--strict", action="store_true", help="Return a non-zero exit code when warnings exist.")
    return parser.parse_args()


def main() -> int:
    """Build a Validator from the CLI arguments and return its exit code."""
    args = parse_args()
    validator = Validator(
        package_dir=Path(args.package_dir).resolve(),
        basename=args.basename,
        formats=args.formats,
        check_deps=args.check_deps,
        strict=args.strict,
    )
    return validator.run()


if __name__ == "__main__":
    raise SystemExit(main())