From ace341a03b2b1fbb60ad9e23a41893b2ee095b79 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sat, 25 Apr 2026 03:20:51 +0000 Subject: [PATCH 1/4] [INFRA] Surface javadoc crash culprit in unidoc failure output Unidoc currently fails with ~100 `[error]` lines on genjavadoc-generated Java stubs under `target/java/...` (private Scala case-class `apply` methods that produce invalid `static public abstract R apply(T1, ...)` Java). These errors are benign -- every PR emits them -- but they overshadow the real cause when javadoc hard-exits mid-stream on specific doc-comment content. The actual crash signal is the last `Generating .../X.html...` line before `javadoc exited with exit code 1`, which a developer has to hunt for by hand in multi-thousand-line CI logs. Tee sbt output to `target/unidoc-build.log` and, on failure, print a framed banner with: - the HTML file javadoc was generating when it died, - the inferred source class to audit, - a one-paragraph hint about the usual scaladoc triggers (wiki-style `[[...]]` links, inline-backtick code refs), - an explicit note that the `[error]` lines on `target/java/...` stubs are not the cause. Heuristic only; when the log doesn't match the mid-HTML-crash pattern (e.g. scaladoc failure, sbt env issue) the banner says so and points back to the full log above. Co-authored-by: Isaac --- docs/_plugins/build_api_docs.rb | 84 ++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/docs/_plugins/build_api_docs.rb b/docs/_plugins/build_api_docs.rb index 83d812069dfc4..38195201272b6 100644 --- a/docs/_plugins/build_api_docs.rb +++ b/docs/_plugins/build_api_docs.rb @@ -133,7 +133,89 @@ def build_spark_scala_and_java_docs_if_necessary command = "build/sbt -Pkinesis-asl unidoc" puts "Running '#{command}'..." - system(command) || raise("Unidoc generation failed") + # Tee sbt output to a log file so we can diagnose failures. 
The most common + # unidoc failure is a javadoc crash mid-stream while generating HTML for a + # specific class, buried under ~100 benign errors on genjavadoc-generated + # Java stubs (e.g. target/java/org/apache/spark/ErrorInfo.java). Without the + # diagnostic below, the real culprit -- the source whose doc tripped javadoc + # -- is effectively invisible in the CI log. + log_file = File.join(SPARK_PROJECT_ROOT, "target", "unidoc-build.log") + mkdir_p(File.dirname(log_file)) + success = stream_and_capture(command, log_file) + unless success + diagnose_unidoc_failure(log_file) + raise("Unidoc generation failed") + end +end + +# Runs `command`, streaming every line to both stdout and `log_file`. Returns +# true iff the command exited 0. Ruby-only; no shell pipefail reliance. +def stream_and_capture(command, log_file) + File.open(log_file, 'w') do |f| + IO.popen("#{command} 2>&1", 'r') do |pipe| + pipe.each_line do |line| + $stdout.write(line) + $stdout.flush + f.write(line) + end + end + end + $?.success? +end + +# Scans the captured unidoc log and prints a pointer to the most likely +# culprit source file. The heuristic: when javadoc dies mid-HTML-generation, +# the last "Generating .../X.html" line before "javadoc exited with exit code" +# names the class that tripped it. Prints nothing actionable if the failure +# mode doesn't match (e.g. a scaladoc error), in which case the full log above +# already shows what's wrong. +def diagnose_unidoc_failure(log_file) + return unless File.exist?(log_file) + lines = File.readlines(log_file) + + javadoc_exit_idx = lines.rindex { |l| l.include?("javadoc exited with exit code") } + last_generating = nil + if javadoc_exit_idx + # Strip ANSI color codes so the regex matches sbt-coloured output too. 
+ ansi = /\e\[[0-9;]*[A-Za-z]/ + lines[0...javadoc_exit_idx].reverse_each do |line| + if line.gsub(ansi, '') =~ %r{Generating .+/javaunidoc/(\S+?\.html)\.\.\.} + last_generating = $1 + break + end + end + end + + banner = "=" * 78 + $stderr.puts "" + $stderr.puts banner + $stderr.puts "Unidoc failed -- diagnostic summary" + $stderr.puts banner + if last_generating + class_path = last_generating.sub(/\.html$/, '') + class_name = class_path.tr('/', '.') + $stderr.puts "" + $stderr.puts " Javadoc crashed while generating: #{last_generating}" + $stderr.puts " Likely culprit: doc comment in #{class_name}" + $stderr.puts "" + $stderr.puts " Javadoc can hard-exit (not just warn) on specific scaladoc" + $stderr.puts " patterns once they have been passed through genjavadoc --" + $stderr.puts " wiki-style `[[Class]]` / `[[method]]` links or inline-backticked" + $stderr.puts " code refs in the Scala source for the class above are common" + $stderr.puts " triggers. Start by auditing any recent doc-string changes in" + $stderr.puts " that source file." + $stderr.puts "" + $stderr.puts " NOTE: the 100 '[error]' lines above, all on files under" + $stderr.puts " target/java/..., are benign genjavadoc stubs -- every PR" + $stderr.puts " emits them and they do not cause the exit. Ignore them." + else + $stderr.puts "" + $stderr.puts " Could not locate a 'javadoc exited with exit code' marker in" + $stderr.puts " the log; the failure is likely outside the javaunidoc step" + $stderr.puts " (scaladoc / sbt / build env). See the full sbt output above." 
+ end + $stderr.puts banner + $stderr.puts "" end def build_scala_and_java_docs From a4b30e83a3da9eaab99bc3d6631732f8a3fff9cc Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sat, 25 Apr 2026 03:20:59 +0000 Subject: [PATCH 2/4] DO NOT MERGE: break a docstring to validate the unidoc diagnostic Intentionally reintroduces the scaladoc pattern that hard-exited javadoc on PR #51419 (wiki-style [[TableIdentifier]] / [[toQualifiedNameParts]] refs plus backtick-inline `Seq[String]`) in CatalogV2Implicits.IdentifierHelper. CI should fail at the unidoc step and the new diagnostic banner should name this class as the culprit. Drop this commit before merging. Co-authored-by: Isaac --- .../spark/sql/connector/catalog/CatalogV2Implicits.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala index cf6052009c927..a5e5d1fe391e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala @@ -172,6 +172,11 @@ private[sql] object CatalogV2Implicits { } /** + * DO NOT MERGE -- intentional javadoc-crash bait to verify CI diagnostic. + * Reproduces the exact doc pattern that hard-exited javadoc on PR #51419: + * a wiki-style link to [[TableIdentifier]], a forward ref to + * [[toQualifiedNameParts]], and an inline-backticked `Seq[String]`. + * * Tries to convert catalog identifier to the table identifier. 
Table identifier does not * support multiple namespaces (nested namespaces), so if identifier contains nested namespace, * conversion cannot be done From 1169340209a137cb70a211f166b8fbcdeb0cf187 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sat, 25 Apr 2026 03:57:36 +0000 Subject: [PATCH 3/4] [INFRA] Address review: differentiate exit-marker branches, shield diagnostic helper, drop magic 100 --- docs/_plugins/build_api_docs.rb | 90 +++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/docs/_plugins/build_api_docs.rb b/docs/_plugins/build_api_docs.rb index 38195201272b6..e6719c4bed7e3 100644 --- a/docs/_plugins/build_api_docs.rb +++ b/docs/_plugins/build_api_docs.rb @@ -171,51 +171,63 @@ def stream_and_capture(command, log_file) # already shows what's wrong. def diagnose_unidoc_failure(log_file) return unless File.exist?(log_file) - lines = File.readlines(log_file) - - javadoc_exit_idx = lines.rindex { |l| l.include?("javadoc exited with exit code") } - last_generating = nil - if javadoc_exit_idx - # Strip ANSI color codes so the regex matches sbt-coloured output too. - ansi = /\e\[[0-9;]*[A-Za-z]/ - lines[0...javadoc_exit_idx].reverse_each do |line| - if line.gsub(ansi, '') =~ %r{Generating .+/javaunidoc/(\S+?\.html)\.\.\.} - last_generating = $1 - break + begin + lines = File.readlines(log_file) + + javadoc_exit_idx = lines.rindex { |l| l.include?("javadoc exited with exit code") } + last_generating = nil + if javadoc_exit_idx + # Strip ANSI color codes so the regex matches sbt-coloured output too. 
+ ansi = /\e\[[0-9;]*[A-Za-z]/ + lines[0...javadoc_exit_idx].reverse_each do |line| + if line.gsub(ansi, '') =~ %r{Generating .+/javaunidoc/(\S+?\.html)\.\.\.} + last_generating = $1 + break + end end end - end - banner = "=" * 78 - $stderr.puts "" - $stderr.puts banner - $stderr.puts "Unidoc failed -- diagnostic summary" - $stderr.puts banner - if last_generating - class_path = last_generating.sub(/\.html$/, '') - class_name = class_path.tr('/', '.') - $stderr.puts "" - $stderr.puts " Javadoc crashed while generating: #{last_generating}" - $stderr.puts " Likely culprit: doc comment in #{class_name}" + banner = "=" * 78 $stderr.puts "" - $stderr.puts " Javadoc can hard-exit (not just warn) on specific scaladoc" - $stderr.puts " patterns once they have been passed through genjavadoc --" - $stderr.puts " wiki-style `[[Class]]` / `[[method]]` links or inline-backticked" - $stderr.puts " code refs in the Scala source for the class above are common" - $stderr.puts " triggers. Start by auditing any recent doc-string changes in" - $stderr.puts " that source file." - $stderr.puts "" - $stderr.puts " NOTE: the 100 '[error]' lines above, all on files under" - $stderr.puts " target/java/..., are benign genjavadoc stubs -- every PR" - $stderr.puts " emits them and they do not cause the exit. Ignore them." 
- else + $stderr.puts banner + $stderr.puts "Unidoc failed -- diagnostic summary" + $stderr.puts banner + if last_generating + class_path = last_generating.sub(/\.html$/, '') + class_name = class_path.tr('/', '.') + $stderr.puts "" + $stderr.puts " Javadoc crashed while generating: #{last_generating}" + $stderr.puts " Likely culprit: doc comment in #{class_name}" + $stderr.puts "" + $stderr.puts " Javadoc can hard-exit (not just warn) on specific scaladoc" + $stderr.puts " patterns once they have been passed through genjavadoc --" + $stderr.puts " wiki-style `[[Class]]` / `[[method]]` links or inline-backticked" + $stderr.puts " code refs in the Scala source for the class above are common" + $stderr.puts " triggers. Start by auditing any recent doc-string changes in" + $stderr.puts " that source file." + $stderr.puts "" + $stderr.puts " NOTE: the '[error]' lines above on files under" + $stderr.puts " target/java/... are benign genjavadoc stubs -- every PR" + $stderr.puts " emits them and they do not cause the exit. Ignore them." + elsif javadoc_exit_idx + $stderr.puts "" + $stderr.puts " Javadoc exited but no class HTML generation was in progress;" + $stderr.puts " the crash predates HTML output -- likely a CLI / classpath /" + $stderr.puts " setup issue. See the full sbt output above." + else + $stderr.puts "" + $stderr.puts " Could not locate a 'javadoc exited with exit code' marker in" + $stderr.puts " the log; the failure is likely outside the javaunidoc step" + $stderr.puts " (scaladoc / sbt / build env). See the full sbt output above." + end + $stderr.puts banner $stderr.puts "" - $stderr.puts " Could not locate a 'javadoc exited with exit code' marker in" - $stderr.puts " the log; the failure is likely outside the javaunidoc step" - $stderr.puts " (scaladoc / sbt / build env). See the full sbt output above." + rescue => e + # Never let the diagnostic helper itself obscure the underlying unidoc + # failure: if anything here goes wrong (e.g. 
encoding error reading the + # log), report it briefly and let the caller raise the real error. + $stderr.puts "(diagnostic helper failed: #{e.class}: #{e.message})" end - $stderr.puts banner - $stderr.puts "" end def build_scala_and_java_docs From e9ac5a9a5f79edc535d21e8e1e17ae177313cd73 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sat, 25 Apr 2026 09:56:26 +0000 Subject: [PATCH 4/4] Revert "DO NOT MERGE: break a docstring to validate the unidoc diagnostic" This reverts commit a4b30e83a3da9eaab99bc3d6631732f8a3fff9cc. --- .../spark/sql/connector/catalog/CatalogV2Implicits.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala index a5e5d1fe391e7..cf6052009c927 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala @@ -172,11 +172,6 @@ private[sql] object CatalogV2Implicits { } /** - * DO NOT MERGE -- intentional javadoc-crash bait to verify CI diagnostic. - * Reproduces the exact doc pattern that hard-exited javadoc on PR #51419: - * a wiki-style link to [[TableIdentifier]], a forward ref to - * [[toQualifiedNameParts]], and an inline-backticked `Seq[String]`. - * * Tries to convert catalog identifier to the table identifier. Table identifier does not * support multiple namespaces (nested namespaces), so if identifier contains nested namespace, * conversion cannot be done