Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ RUN if [ ! -z "${RHOSO_DOCS_GIT_URL}" ]; then \
fi

# -- Stage 1c: Generate OCP plaintext formatted documentation ----------

# Use the right CPU/GPU image or it will break the embedding stage as we replace the venv directory
FROM quay.io/lightspeed-core/rag-content-${FLAVOR}:latest as docs-base-ocp

Expand Down Expand Up @@ -100,12 +101,38 @@ RUN if [[ "${BUILD_OCP_DOCS}" == "true" ]]; then \
mkdir -p /rag-content/ocp-product-docs-plaintext; \
fi

# -- Stage 1d: Generate OpenStack Operators plaintext formatted documentation ----------
FROM registry.access.redhat.com/ubi9/python-311 as docs-base-operators

# Build-time switches for this stage (ARG values are scoped per stage)
ARG BUILD_OPERATORS_DOCS=false
ARG OPERATORS_REPO_URL=https://github.com/openstack-k8s-operators/openstack-operator.git
ARG OPERATORS_BRANCH=main

# Exported so that get_openstack_operators_docs.sh picks them up from the environment
ENV OPERATORS_REPO_URL=$OPERATORS_REPO_URL
ENV OPERATORS_BRANCH=$OPERATORS_BRANCH

USER 0
WORKDIR /rag-content

COPY ./scripts ./scripts

# Install asciidoctor and html2text for converting AsciiDoc to plaintext, then
# generate the docs. When the operators docs are disabled we still create an
# empty output directory: the embeddings stage copies it unconditionally with
# COPY --from=docs-base-operators, which fails if the path does not exist.
RUN if [ "$BUILD_OPERATORS_DOCS" = "true" ]; then \
        dnf install -y ruby python3-pip && \
        gem install asciidoctor && \
        pip install html2text && \
        ./scripts/get_openstack_operators_docs.sh; \
    else \
        mkdir -p /rag-content/openstack-operators-docs-plaintext; \
    fi


# -- Stage 2: Compute embeddings for the doc chunks ---------------------------
FROM quay.io/lightspeed-core/rag-content-${FLAVOR}:latest as lightspeed-core-rag-builder
COPY --from=docs-base-upstream /rag-content /rag-content
COPY --from=docs-base-downstream /rag-content /rag-content
# Limit what we copy to make it faster
COPY --from=docs-base-ocp /rag-content/ocp-product-docs-plaintext /rag-content/ocp-product-docs-plaintext
COPY --from=docs-base-operators /rag-content/openstack-operators-docs-plaintext /rag-content/openstack-operators-docs-plaintext


ARG FLAVOR=cpu
ARG BUILD_UPSTREAM_DOCS=true
Expand All @@ -116,6 +143,7 @@ ARG NUM_WORKERS=1
ARG RHOSO_DOCS_GIT_URL=""
ARG VECTOR_DB_TYPE="faiss"
ARG BUILD_OKP_CONTENT=false
# Default must match the docs-base-operators stage and the Makefile (both
# "false"); otherwise a build without an explicit --build-arg would pass
# --operators-folder for documentation that was never generated.
ARG BUILD_OPERATORS_DOCS=false
ARG OKP_CONTENT="all"
ARG HERMETIC=false

Expand All @@ -137,6 +165,9 @@ RUN if [ "$FLAVOR" == "gpu" ]; then \
if [ "$BUILD_OKP_CONTENT" = "true" ]; then \
FOLDER_ARG="$FOLDER_ARG --okp-folder ./okp-content --okp-content ${OKP_CONTENT}"; \
fi && \
if [ "$BUILD_OPERATORS_DOCS" = "true" ]; then \
FOLDER_ARG="$FOLDER_ARG --operators-folder openstack-operators-docs-plaintext"; \
fi && \
python ./scripts/generate_embeddings_openstack.py \
--output ./vector_db/ \
--model-dir embeddings_model \
Expand Down
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ BUILD_OKP_CONTENT ?= false
OKP_CONTENT ?= "all"
RHOSO_REMAP_TITLES ?= {}
RHOSO_EXCLUDE_TITLES ?= ""
# Set to "true" to include the OpenStack operators documentation in the image
# (forwarded to the container build as the BUILD_OPERATORS_DOCS build arg)
BUILD_OPERATORS_DOCS ?= false
# Git repository and branch forwarded to the container build as the
# OPERATORS_REPO_URL / OPERATORS_BRANCH build args
OPERATORS_REPO_URL ?= https://github.com/openstack-k8s-operators/openstack-operator.git
OPERATORS_BRANCH ?= main

HERMETIC ?= false

Expand Down Expand Up @@ -65,6 +68,9 @@ build-image-os: ## Build a openstack rag-content container image
--build-arg OCP_VERSIONS=$(OCP_VERSIONS) \
--build-arg OLS_DOC_REPO=$(OLS_DOC_REPO) \
--build-arg HERMETIC=$(HERMETIC) \
--build-arg BUILD_OPERATORS_DOCS=$(BUILD_OPERATORS_DOCS) \
--build-arg OPERATORS_REPO_URL=$(OPERATORS_REPO_URL) \
--build-arg OPERATORS_BRANCH=$(OPERATORS_BRANCH) \
$(BUILD_GPU_ARGS) .

get-embeddings-model: ## Download embeddings model from the openstack-lightspeed/rag-content container image
Expand Down
54 changes: 52 additions & 2 deletions scripts/generate_embeddings_openstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,38 @@ def url_function(self, file_path: str):
)


#
# Functions related to Openstack Operators
#

class OpenStackOperatorMetadataProcessor(MetadataProcessor):
    """Metadata processor for OpenStack OpenShift Operators Documentation.

    Maps plain text files found under ``folder_path`` to URLs below
    ``base_url``.
    """

    def __init__(self, folder_path: str):
        """Remember the documentation root used to compute relative URLs.

        Args:
            folder_path: Directory containing the plain text operators docs.
        """
        super().__init__()
        self.folder_path = Path(folder_path)
        self.base_url = "https://openstack-k8s-operators.github.io/openstack-operator"

    def url_function(self, path: str) -> str:
        """Generate the URL for a document based on its file path.

        ``<dir>/index.txt`` becomes the directory-style URL ``<dir>/``
        (e.g. ``ctlplane/index.txt`` -> ``ctlplane/``); any other ``.txt``
        file has its suffix replaced by ``.html``.

        Args:
            path: Path of the plain text document.

        Returns:
            The URL of the document below ``base_url``.
        """
        resolved = Path(path).resolve()
        try:
            relative = resolved.relative_to(self.folder_path.resolve())
        except ValueError:
            # File lives outside the docs folder: fall back to the bare file
            # name. Keep it a Path so the handling below works (a plain str
            # has no as_posix()/suffix and used to raise AttributeError).
            relative = Path(resolved.name)

        # Directory-style URL for index files located in a sub-directory
        if relative.name == "index.txt" and relative.parent != Path("."):
            return f"{self.base_url}/{relative.parent.as_posix()}/"

        # Only swap the real file suffix, never a ".txt" occurring elsewhere
        # in the path (e.g. inside a directory name).
        if relative.suffix == ".txt":
            relative = relative.with_suffix(".html")

        return f"{self.base_url}/{relative.as_posix()}"

#
# Functions related to OpenStack OKP
#
Expand Down Expand Up @@ -184,6 +216,13 @@ def copy_openstack_documentation(
required=False,
help="Directory containing the plain text RHOSO documentation",
)
parser.add_argument(
"-opf",
"--operators-folder",
type=Path,
required=False,
help="Directory containing the plain text OpenStack Operators documentation",
)
parser.add_argument(
"-ua",
"--unreachable-action",
Expand Down Expand Up @@ -222,9 +261,9 @@ def copy_openstack_documentation(

args = parser.parse_args()

if not any([args.folder, args.rhoso_folder, args.okp_folder]):
if not any([args.folder, args.rhoso_folder, args.okp_folder, args.operators_folder]):
print(
'Error: Either the "--folder" and/or "--rhoso-folder" and/or "--okp-folder" options '
'Error: Either the "--folder" and/or "--rhoso-folder" and/or "--okp-folder" and/or "--operators-folder" options '
"must be provided",
file=sys.stderr,
)
Expand Down Expand Up @@ -267,6 +306,17 @@ def copy_openstack_documentation(
unreachable_action=args.unreachable_action,
)

# Process the OpenStack Operators document, if provided
if args.operators_folder:
document_processor.process(
str(args.operators_folder),
metadata=OpenStackOperatorMetadataProcessor(args.operators_folder),
required_exts=[
".txt",
],
unreachable_action=args.unreachable_action,
)

# Process the OKP files, if provided
okp_out_dir = None
if args.okp_folder:
Expand Down
124 changes: 124 additions & 0 deletions scripts/get_openstack_operators_docs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash
# Copyright 2025 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Abort on errors, on unset variables and on failures inside pipelines
set -euo pipefail
set -x

# The name of the output directory
OUTPUT_DIR_NAME=${OUTPUT_DIR_NAME:-openstack-operators-docs-plaintext}

# GitHub repository details
OPERATORS_REPO_URL=${OPERATORS_REPO_URL:-https://github.com/openstack-k8s-operators/openstack-operator.git}
OPERATORS_BRANCH=${OPERATORS_BRANCH:-main}

# Working directory
WORKING_DIR="${WORKING_DIR:-/tmp/os_operators_docs_temp}"

# Whether to delete files on success or not.
# The only value acted upon is "all", which removes the working directory;
# with any other value nothing is deleted.
CLEAN_FILES="${CLEAN_FILES:-}"

# The current directory where the script was invoked
CURR_DIR=$(pwd)

echo "Fetching OpenStack operators documentation"

# Check that the conversion tools are available
for required_tool in asciidoctor html2text; do
    if ! command -v "$required_tool" &> /dev/null; then
        echo "Error: '$required_tool' is not installed, please install it before continuing." >&2
        exit 1
    fi
done

mkdir -p "$WORKING_DIR"
cd "$WORKING_DIR"
echo "Working directory: $WORKING_DIR"

# Clone the repository if not present.
# The target directory is given explicitly so that a custom OPERATORS_REPO_URL
# (fork or mirror with a different repository name) still ends up in the
# directory the rest of the script expects.
if [ ! -d "openstack-operator" ]; then
    git clone -v --depth=1 --single-branch -b "${OPERATORS_BRANCH}" "${OPERATORS_REPO_URL}" openstack-operator
fi

cd openstack-operator/docs

# Convert AsciiDoc files to plain text
echo "Converting AsciiDoc documentation to plain text..."

# Start from a clean staging directory so files converted by a previous run
# in the same working directory do not leak into the output
rm -rf "$WORKING_DIR/operators-docs-text"

# Create output directory structure
mkdir -p "$WORKING_DIR/operators-docs-text/ctlplane"
mkdir -p "$WORKING_DIR/operators-docs-text/dataplane"

# Render a single AsciiDoc file as plain text.
#   $1 - path of the .adoc source file
#   $2 - path of the plain text file to write
# The intermediate HTML is written next to the source and removed afterwards.
convert_adoc_to_text() {
    local src=$1
    local dest=$2
    local html_tmp="${src%.adoc}.html"

    # AsciiDoc -> HTML
    asciidoctor "$src" -o "$html_tmp"

    # HTML -> plain text
    html2text "$html_tmp" utf8 > "$dest"

    # Drop the intermediate HTML file
    rm -f "$html_tmp"
}

# Process ctlplane documentation
if [ -f "ctlplane.adoc" ]; then
    echo "Converting ctlplane.adoc..."
    convert_adoc_to_text "ctlplane.adoc" "$WORKING_DIR/operators-docs-text/ctlplane/index.txt"
fi

# Process dataplane documentation
if [ -f "dataplane.adoc" ]; then
    echo "Converting dataplane.adoc..."
    convert_adoc_to_text "dataplane.adoc" "$WORKING_DIR/operators-docs-text/dataplane/index.txt"
fi

# Process any additional adoc files in assemblies directory
if [ -d "assemblies" ]; then
    echo "Processing assemblies directory..."
    # NUL-delimited names with an empty IFS keep file names intact even when
    # they contain leading/trailing whitespace or newlines.
    while IFS= read -r -d '' adoc_file; do
        # Get relative path and convert to output path
        rel_path="${adoc_file#assemblies/}"
        output_path="$WORKING_DIR/operators-docs-text/assemblies/${rel_path%.adoc}.txt"

        mkdir -p "$(dirname "$output_path")"
        echo "Converting $adoc_file..."
        convert_adoc_to_text "$adoc_file" "$output_path"
    done < <(find assemblies -name "*.adoc" -type f -print0)
fi

# Exit docs directory
cd "$WORKING_DIR"

# Copy to final output directory, replacing any previous output
rm -rf "$CURR_DIR/$OUTPUT_DIR_NAME"
cp -r "$WORKING_DIR/operators-docs-text" "$CURR_DIR/$OUTPUT_DIR_NAME"

# Remove artifacts if requested
if [ "${CLEAN_FILES}" == "all" ]; then
    rm -rf "$WORKING_DIR"
fi

echo "Done. OpenStack operators documentation can be found at $CURR_DIR/$OUTPUT_DIR_NAME"