diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4c5df380f6..6ddf2583c4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -5,7 +5,7 @@ }, "metadata": { "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.", - "version": "26.04.00" + "version": "26.06.00" }, "plugins": [ { diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 5f34873671..e740506140 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "nvidia-cuopt-skills", "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. Use when building or solving optimization with cuOpt.", - "version": "26.04.00", + "version": "26.06.00", "author": { "name": "NVIDIA" }, diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7958eac440..cdbf4df577 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1,6 @@ +# Default owner for paths with no later, more specific match +* @nvidia/cuopt-infra-codeowners + #cpp code owners cpp/ @nvidia/cuopt-engine-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3eb1f1f066..a945cde8ec 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -65,7 +65,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-publish-cuopt-mps-parser: needs: wheel-build-cuopt-mps-parser secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-build-libcuopt: needs: wheel-build-cuopt-mps-parser secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-libcuopt: needs: wheel-build-libcuopt secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-build-cuopt: needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -135,7 +135,7 @@ jobs: wheel-publish-cuopt: needs: wheel-build-cuopt secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -145,7 +145,7 @@ jobs: package-type: python wheel-build-cuopt-server: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -160,7 +160,7 @@ jobs: wheel-publish-cuopt-server: needs: wheel-build-cuopt-server secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -171,7 +171,7 @@ jobs: docs-build: needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} node_type: "gpu-l4-latest-1" @@ -181,11 +181,11 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -201,7 +201,7 @@ jobs: wheel-publish-cuopt-sh-client: needs: wheel-build-cuopt-sh-client secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml index f8f7366e13..17d4e9ab57 100644 --- a/.github/workflows/build_test_publish_images.yaml +++ b/.github/workflows/build_test_publish_images.yaml @@ -55,7 +55,7 @@ jobs: compute-matrix: runs-on: ubuntu-latest container: - image: rapidsai/ci-conda:26.04-latest + image: rapidsai/ci-conda:26.06-latest outputs: MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 47a3bd9fca..a652c23b9a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,7 +34,7 @@ jobs: - wheel-build-cuopt-sh-client - test-self-hosted-server secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} @@ -111,7 +111,7 @@ jobs: changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | build_docs: @@ -279,20 +279,20 @@ jobs: - '!gemini-extension.json' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: enable_check_generated_files: false conda-cpp-build: needs: [checks, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request script: ci/build_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }} conda-cpp-tests: needs: [conda-cpp-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -308,14 +308,14 @@ jobs: conda-python-build: needs: [conda-cpp-build, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} conda-python-tests: needs: [conda-python-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda with: run_codecov: false @@ -332,7 +332,7 @@ jobs: docs-build: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs with: build_type: pull-request @@ -340,12 +340,12 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-mps-parser: needs: compute-matrix-filters secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_mps_parser.sh @@ -357,7 +357,7 @@ jobs: wheel-build-libcuopt: needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }} @@ -368,7 +368,7 @@ jobs: wheel-build-cuopt: needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt.sh @@ -377,7 +377,7 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} wheel-tests-cuopt: needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request @@ -393,7 +393,7 @@ jobs: wheel-build-cuopt-server: needs: [checks, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_server.sh @@ -405,7 +405,7 @@ jobs: wheel-build-cuopt-sh-client: needs: compute-matrix-filters secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_sh_client.sh @@ -417,7 +417,7 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }} wheel-tests-cuopt-server: needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ad7609e8a..a8cc5f2943 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -27,7 +27,7 @@ on: jobs: conda-cpp-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-python-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: run_codecov: false build_type: ${{ inputs.build_type }} @@ -58,7 +58,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt-server: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -97,5 +97,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_notebooks.sh diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index d394b97db4..57b178740c 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -15,7 +15,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d03641fde..a935201f21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -117,7 +117,7 @@ Architecture: - Clone the repository: ```bash -CUOPT_HOME=$(pwd)/cuopt +export CUOPT_HOME=$(pwd)/cuopt git clone https://github.com/NVIDIA/cuopt.git $CUOPT_HOME cd $CUOPT_HOME ``` @@ -193,19 +193,20 @@ To build all libraries and tests, simply run To run the C++ tests, run ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ -ctest --test-dir ${CUOPT_HOME}/cpp/build # libcuopt +ctest --test-dir ${CUOPT_HOME}/cpp/build -E L1TEST # libcuopt ``` +`L1TEST`s are excluded because they are expensive and not run as part of the typical development process. To run python tests, run - To run `cuopt` tests: ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index d5ea6ced53..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -release/26.04 +main diff --git a/README.md b/README.md index 379a48c350..95c8598d77 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # cuOpt - GPU-accelerated Optimization [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml) -[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases) +[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases) [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html) [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt) [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples) @@ -83,7 +83,7 @@ For CUDA 12.x: pip install \ --extra-index-url=https://pypi.nvidia.com \ nvidia-cuda-runtime-cu12==12.9.* \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. @@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` For CUDA 13.x: @@ -99,7 +99,7 @@ For CUDA 13.x: ```bash pip install \ --extra-index-url=https://pypi.nvidia.com \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. @@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` @@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed. ```bash -conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.* +conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.* ``` We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 0bd0e8a95b..cdb610a24d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -26.04.00 +26.06.00 diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index e01e533a65..be7cc707c9 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -117,7 +118,7 @@ void read_single_solution_from_path(const std::string& path, } } -// reads a solution from an input file. The input file needs to be csv formatted +// Reads a solution from an input file. The input file needs to be csv formatted // var_name,val std::vector> read_solution_from_dir(const std::string file_path, const std::string& mps_file_name, @@ -137,6 +138,58 @@ std::vector> read_solution_from_dir(const std::string file_p return initial_solutions; } +struct incumbent_record_t { + double objective; + double work_timestamp; + double wall_time; + cuopt::internals::mip_solution_origin_t origin; +}; + +class incumbent_tracker_t : public cuopt::internals::get_solution_callback_ext_t { + public: + incumbent_tracker_t(std::chrono::high_resolution_clock::time_point start_time) + : start_time_(start_time) + { + } + + void get_solution(void* data, + void* cost, + void* solution_bound, + const cuopt::internals::mip_solution_callback_info_t* info, + void* user_data) override + { + double obj = *static_cast(cost); + double wt = (info != nullptr) ? info->work_timestamp : -1.0; + auto origin = (info != nullptr) ? (cuopt::internals::mip_solution_origin_t)info->origin + : cuopt::internals::mip_solution_origin_t::UNKNOWN; + auto now = std::chrono::high_resolution_clock::now(); + double wall_s = std::chrono::duration(now - start_time_).count(); + records_.push_back({obj, wt, wall_s, (cuopt::internals::mip_solution_origin_t)origin}); + } + + void write_csv(const std::string& path) const + { + std::ofstream f(path); + if (!f.is_open()) { + fprintf(stderr, "Failed to open incumbent CSV: %s\n", path.c_str()); + return; + } + f << "index,objective,work_timestamp,wall_time_s,origin\n"; + for (size_t i = 0; i < records_.size(); ++i) { + auto& r = records_[i]; + f << i << "," << std::setprecision(15) << r.objective << "," << r.work_timestamp << "," + << std::setprecision(6) << r.wall_time << "," + << cuopt::internals::mip_solution_origin_to_string(r.origin) << "\n"; + } + } + + size_t size() const { return records_.size(); } + + private: + std::chrono::high_resolution_clock::time_point start_time_; + std::vector records_; +}; + int run_single_file(std::string file_path, int device, int batch_id, @@ -203,21 +256,40 @@ int run_single_file(std::string file_path, } } } - settings.time_limit = time_limit; - settings.work_limit = work_limit; - settings.heuristics_only = heuristics_only; - settings.num_cpu_threads = num_cpu_threads; - settings.log_to_console = log_to_console; - settings.determinism_mode = deterministic ? CUOPT_MODE_DETERMINISTIC : CUOPT_MODE_OPPORTUNISTIC; + settings.time_limit = time_limit; + settings.work_limit = work_limit; + settings.heuristics_only = heuristics_only; + settings.num_cpu_threads = num_cpu_threads; + settings.log_to_console = log_to_console; + if (deterministic) { + settings.determinism_mode = + heuristics_only ? CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS : CUOPT_MODE_DETERMINISTIC; + } else { + settings.determinism_mode = CUOPT_MODE_OPPORTUNISTIC; + } + CUOPT_LOG_INFO( + "run_mip settings: heuristics_only=%d deterministic=%d determinism_mode=%d " + "time_limit=%.6f work_limit=%.6f", + (int)heuristics_only, + (int)deterministic, + settings.determinism_mode, + settings.time_limit, + settings.work_limit); settings.tolerances.relative_tolerance = 1e-12; settings.tolerances.absolute_tolerance = 1e-6; settings.presolver = cuopt::linear_programming::presolver_t::Default; settings.reliability_branching = reliability_branching; settings.clique_cuts = -1; settings.seed = 42; + settings.bb_work_unit_scale = 1.0; + settings.gpu_heur_work_unit_scale = 1.0; + settings.mip_scaling = false; + settings.gpu_heur_wait_for_exploration = false; cuopt::linear_programming::benchmark_info_t benchmark_info; settings.benchmark_info_ptr = &benchmark_info; auto start_run_solver = std::chrono::high_resolution_clock::now(); + incumbent_tracker_t incumbent_tracker(start_run_solver); + settings.set_mip_callback(&incumbent_tracker); auto solution = cuopt::linear_programming::solve_mip(&handle_, mps_data_model, settings); CUOPT_LOG_INFO( "first obj: %f last improvement of best feasible: %f last improvement after recombination: %f", @@ -253,7 +325,13 @@ int run_single_file(std::string file_path, << benchmark_info.last_improvement_after_recombination << "," << mip_gap << "," << is_optimal << "\n"; write_to_output_file(out_dir, base_filename, device, n_gpus, batch_id, ss.str()); - CUOPT_LOG_INFO("Results written to the file %s", base_filename.c_str()); + if (!out_dir.empty()) { + std::string mps_stem = base_filename.substr(0, base_filename.find(".mps")); + std::string csv_path = out_dir + "/" + mps_stem + "_incumbents.csv"; + incumbent_tracker.write_csv(csv_path); + CUOPT_LOG_INFO( + "Incumbent trace (%zu entries) written to %s", incumbent_tracker.size(), csv_path.c_str()); + } return sol_found; } diff --git a/ci/compute-sanitizer-suppressions.xml b/ci/compute-sanitizer-suppressions.xml new file mode 100644 index 0000000000..624b3aa0bd --- /dev/null +++ b/ci/compute-sanitizer-suppressions.xml @@ -0,0 +1,249 @@ + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + .* + + + + .*libcuda.so.* + + + cusparseCsr2cscEx2 + .*libcusparse.so.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + cudaLaunchKernel + + + .*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 2 bytes + 2 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + cudaLaunchKernel + + + .*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 8 bytes + 8 + + + DeviceSegmentedReduceKernel + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + libcudart.* + + + .*libcuopt.* + + + .*Device(Reduce|Scan)Kernel.* + + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + 16 + + + + cuMemcpyDtoHAsync.* + .*libcuda.so.* + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*librmm.so.* + + + rmm::device_buffer::device_buffer + .*librmm.so.* + + + + + + Initcheck + + Uninitialized __global__ memory read + + + transform_kernel + + + + cuLaunchKernel_ptsz + .*libcuda.so.* + + + .*libcudart.so.* + + + cudaLaunchKernel_ptsz + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*librmm.so.* + + + .*librmm.so.* + + + rmm::device_uvector.*::device_uvector + .*libcuopt.so.* + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyDtoDAsync.* + .*libcuda.so.* + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + cudaMemcpyAsync + + + rmm::device_buffer::resize + .*librmm.so.* + + + + diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 3d6c356b3d..9a67bb65a5 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -152,3 +152,6 @@ elif [[ "${RUN_CONTEXT}" == "release" ]]; then sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/faq.rst sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/cuopt-python/routing/routing-example.ipynb fi + +# Update docs version switcher to include the new version +python ci/utils/update_doc_versions.py diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index cf3563d476..04dc6bb83c 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index a8a589e48b..21891cc9f2 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 477c708918..89147b18a7 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api - cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index d5fcba0b73..8df6f28bf7 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api - cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9249b53171..06523bf404 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -652,11 +652,14 @@ rapids_cpm_find( if(NOT BUILD_LP_ONLY) add_executable(cuopt_cli cuopt_cli.cpp) +# PIE executable: auditwheel/patchelf expands .dynstr/RPATH when repairing wheels; non-PIE +# (ET_EXEC) binaries are prone to corrupt segment layout. PIE (ET_DYN) survives RPATH edits. set_target_properties(cuopt_cli PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON CXX_SCAN_FOR_MODULES OFF + POSITION_INDEPENDENT_CODE ON ) target_compile_options(cuopt_cli @@ -664,6 +667,8 @@ target_compile_options(cuopt_cli "$<$:${CUOPT_CUDA_FLAGS}>" ) +target_link_options(cuopt_cli PRIVATE -pie) + target_include_directories(cuopt_cli PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 06eacb3408..813c6f0cf4 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -104,10 +104,39 @@ #define CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT "mip_hyper_heuristic_relaxed_lp_time_limit" #define CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT \ "mip_hyper_heuristic_related_vars_time_limit" - -/* @brief MIP determinism mode constants */ -#define CUOPT_MODE_OPPORTUNISTIC 0 -#define CUOPT_MODE_DETERMINISTIC 1 +#define CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE "mip_hyper_heuristic_cpufj_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE \ + "mip_hyper_heuristic_gpu_heur_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE "mip_hyper_heuristic_bb_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION \ + "mip_hyper_heuristic_gpu_heur_wait_for_exploration" + +/* @brief MIP determinism mode flags (bitset) */ +#define CUOPT_DETERMINISM_NONE 0x0 +// matches the previous value of '1' which was for B&B-only determinism in the previous release +#define CUOPT_DETERMINISM_BB 0x1 +#define CUOPT_DETERMINISM_GPU_HEURISTICS 0x2 +#define CUOPT_DETERMINISM_FULL (CUOPT_DETERMINISM_BB | CUOPT_DETERMINISM_GPU_HEURISTICS) + +#define CUOPT_MODE_OPPORTUNISTIC CUOPT_DETERMINISM_NONE +#define CUOPT_MODE_DETERMINISTIC CUOPT_DETERMINISM_FULL +#define CUOPT_MODE_DETERMINISTIC_BB CUOPT_DETERMINISM_BB +#define CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS CUOPT_DETERMINISM_GPU_HEURISTICS + +/* @brief MIP solution origin constants */ +#define CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN 0 +#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND 1 +#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING 2 +#define CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP 3 +#define CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP 4 +#define CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH 5 +#define CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING 6 +#define CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION 7 +#define CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP 8 +#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL 9 +#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED 10 +#define CUOPT_MIP_SOLUTION_ORIGIN_RINS 11 +#define CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE 12 /* @brief LP/MIP termination status constants */ #define CUOPT_TERMINATION_STATUS_NO_TERMINATION 0 diff --git a/cpp/include/cuopt/linear_programming/cuopt_c.h b/cpp/include/cuopt/linear_programming/cuopt_c.h index 4c4d44c764..f72a00e932 100644 --- a/cpp/include/cuopt/linear_programming/cuopt_c.h +++ b/cpp/include/cuopt/linear_programming/cuopt_c.h @@ -71,6 +71,23 @@ typedef int32_t cuopt_int_t; typedef int64_t cuopt_int_t; #endif +/** + * @brief Extended callback information passed to cuOptMIPGetSolutionCallbackExt. + * + * Provides metadata about each incumbent solution reported during a MIP solve. + * + * Fields are append-only. Existing fields will never be reordered, removed, + * or change type across releases. + */ +typedef struct { + /** Which solver component found this solution (CUOPT_MIP_SOLUTION_ORIGIN_*). */ + uint32_t origin; + /** Deterministic work-unit timestamp at which the solution was found. + * Monotonically increasing across successive callbacks within a single solve. + * In non-deterministic mode this value is informational only. */ + double work_timestamp; +} cuOptMIPSolutionCallbackInfo; + /** * @brief Get the size of the float type. * @@ -713,6 +730,24 @@ typedef void (*cuOptMIPGetSolutionCallback)(const cuopt_float_t* solution, const cuopt_float_t* solution_bound, void* user_data); +/** + * @brief Type of callback for receiving incumbent MIP solutions with extended metadata. + * + * @param[in] solution - Pointer to incumbent solution values. + * @param[in] objective_value - Pointer to incumbent objective value. + * @param[in] solution_bound - Pointer to current solution (dual/user) bound. + * @param[in] callback_info - Pointer to callback metadata. + * @param[in] user_data - Pointer to user data. + * @note All pointer arguments refer to host memory and are only valid during the callback + * invocation. Do not pass device/GPU pointers. Copy any data you need to keep after the callback + * returns. + */ +typedef void (*cuOptMIPGetSolutionCallbackExt)(const cuopt_float_t* solution, + const cuopt_float_t* objective_value, + const cuopt_float_t* solution_bound, + const cuOptMIPSolutionCallbackInfo* callback_info, + void* user_data); + /** * @brief Type of callback for injecting MIP solutions with user context. * @@ -748,6 +783,19 @@ cuopt_int_t cuOptSetMIPGetSolutionCallback(cuOptSolverSettings settings, cuOptMIPGetSolutionCallback callback, void* user_data); +/** + * @brief Register an extended callback to receive incumbent MIP solutions with extended metadata. + * + * @param[in] settings - The solver settings object. + * @param[in] callback - Callback function to receive incumbent solutions and callback metadata. + * @param[in] user_data - User-defined pointer passed through to the callback. + * + * @return A status code indicating success or failure. + */ +cuopt_int_t cuOptSetMIPGetSolutionCallbackExt(cuOptSolverSettings settings, + cuOptMIPGetSolutionCallbackExt callback, + void* user_data); + /** * @brief Register a callback to inject MIP solutions. * diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 14c4d227bc..77425276c3 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -107,6 +107,13 @@ class mip_solver_settings_t { i_t strong_branching_simplex_iteration_limit = -1; i_t num_gpus = 1; bool log_to_console = true; + // User-facing multipliers on top of internal baseline work-unit scales. + // 1.0 = use internally calibrated default. Values > 1 make that component appear to do more work. + f_t cpufj_work_unit_scale = 1.0; + f_t gpu_heur_work_unit_scale = 1.0; + f_t bb_work_unit_scale = 1.0; + // When true, GPU heuristics wait for B&B to finish root solve before starting. + bool gpu_heur_wait_for_exploration = false; std::string log_file; std::string sol_file; @@ -118,15 +125,15 @@ class mip_solver_settings_t { int mip_scaling = CUOPT_MIP_SCALING_NO_OBJECTIVE; presolver_t presolver{presolver_t::Default}; /** - * @brief Determinism mode for MIP solver. + * @brief Determinism mode for MIP solver (bitset). * - * Controls the determinism behavior of the MIP solver: - * - CUOPT_MODE_OPPORTUNISTIC (0): Default mode, allows non-deterministic - * parallelism for better performance - * - CUOPT_MODE_DETERMINISTIC (1): Ensures deterministic results across runs - * at potential cost of performance + * Bitwise OR of CUOPT_DETERMINISM_* flags: + * - CUOPT_DETERMINISM_NONE (0x0): Opportunistic, non-deterministic. + * - CUOPT_DETERMINISM_BB (0x1): Deterministic B&B tree exploration. + * - CUOPT_DETERMINISM_GPU_HEURISTICS (0x2): Deterministic GPU heuristic pipeline. + * - CUOPT_DETERMINISM_FULL (0x3): Both B&B and GPU heuristics deterministic. */ - int determinism_mode = CUOPT_MODE_OPPORTUNISTIC; + int determinism_mode = CUOPT_DETERMINISM_NONE; /** * @brief Random seed for the MIP solver. * diff --git a/cpp/include/cuopt/linear_programming/utilities/internals.hpp b/cpp/include/cuopt/linear_programming/utilities/internals.hpp index bdfbb969d2..6fad66b5f5 100644 --- a/cpp/include/cuopt/linear_programming/utilities/internals.hpp +++ b/cpp/include/cuopt/linear_programming/utilities/internals.hpp @@ -13,6 +13,8 @@ #include #include +#include + namespace cuopt { namespace internals { @@ -21,7 +23,49 @@ class Callback { virtual ~Callback() {} }; -enum class base_solution_callback_type { GET_SOLUTION, SET_SOLUTION }; +enum class mip_solution_origin_t : uint32_t { + UNKNOWN = CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN, + BRANCH_AND_BOUND_NODE = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND, + BRANCH_AND_BOUND_DIVING = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING, + FEASIBILITY_JUMP = CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP, + CPU_FEASIBILITY_JUMP = CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP, + LOCAL_SEARCH = CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH, + LP_ROUNDING = CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING, + RECOMBINATION = CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION, + SUB_MIP = CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP, + USER_INITIAL = CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL, + USER_INJECTED = CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED, + RINS = CUOPT_MIP_SOLUTION_ORIGIN_RINS, + PRESOLVE = CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE, +}; + +constexpr const char* mip_solution_origin_to_string(mip_solution_origin_t origin) +{ + switch (origin) { + case mip_solution_origin_t::UNKNOWN: return "unknown"; + case mip_solution_origin_t::BRANCH_AND_BOUND_NODE: return "branch_and_bound_node"; + case mip_solution_origin_t::BRANCH_AND_BOUND_DIVING: return "branch_and_bound_diving"; + case mip_solution_origin_t::FEASIBILITY_JUMP: return "feasibility_jump"; + case mip_solution_origin_t::CPU_FEASIBILITY_JUMP: return "cpu_feasibility_jump"; + case mip_solution_origin_t::LOCAL_SEARCH: return "local_search"; + case mip_solution_origin_t::LP_ROUNDING: return "lp_rounding"; + case mip_solution_origin_t::RECOMBINATION: return "recombination"; + case mip_solution_origin_t::SUB_MIP: return "sub_mip"; + case mip_solution_origin_t::USER_INITIAL: return "user_initial"; + case mip_solution_origin_t::USER_INJECTED: return "user_injected"; + case mip_solution_origin_t::RINS: return "rins"; + case mip_solution_origin_t::PRESOLVE: + return "presolve"; + // no default to trigger compiler -Werror + } + return "unknown"; +} + +using mip_solution_callback_info_t = cuOptMIPSolutionCallbackInfo; + +// get_solution_ext was added to support passing additional information to the get_solution callback +// without inducing a breaking ABI change +enum class base_solution_callback_type { GET_SOLUTION, GET_SOLUTION_EXT, SET_SOLUTION }; class base_solution_callback_t : public Callback { public: @@ -55,6 +99,19 @@ class get_solution_callback_t : public base_solution_callback_t { } }; +class get_solution_callback_ext_t : public base_solution_callback_t { + public: + virtual void get_solution(void* data, + void* objective_value, + void* solution_bound, + const mip_solution_callback_info_t* callback_info, + void* user_data) = 0; + base_solution_callback_type get_type() const override + { + return base_solution_callback_type::GET_SOLUTION_EXT; + } +}; + class set_solution_callback_t : public base_solution_callback_t { public: virtual void set_solution(void* data, diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 4da66abe77..902e691e64 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -40,7 +40,9 @@ #include #include +#include #include +#include namespace cuopt::linear_programming::dual_simplex { diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp index d37760cd07..69e72d66bc 100644 --- a/cpp/src/barrier/iterative_refinement.hpp +++ b/cpp/src/barrier/iterative_refinement.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 33a2d983c9..153f6e0def 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -5,9 +5,12 @@ */ /* clang-format on */ +#include + #include #include #include +#include #include #include @@ -25,6 +28,7 @@ #include #include +#include #include @@ -35,13 +39,20 @@ #include #include #include -#include #include #include #include -#include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(logger, ...) \ + do { \ + logger.printf(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::dual_simplex { namespace { @@ -270,6 +281,22 @@ branch_and_bound_t::branch_and_bound_t( dualize_info_t dualize_info; convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info); full_variable_types(original_problem_, original_lp_, var_types_); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic LP init state: rows=%d cols=%d nnz=%zu slacks=%zu slack_hash=0x%x " + "rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x Acol_hash=0x%x Arow_hash=0x%x " + "Aval_hash=0x%x\n", + original_lp_.num_rows, + original_lp_.num_cols, + original_lp_.A.x.size(), + new_slacks_.size(), + detail::compute_hash(new_slacks_), + detail::compute_hash(original_lp_.rhs), + detail::compute_hash(original_lp_.lower), + detail::compute_hash(original_lp_.upper), + detail::compute_hash(original_lp_.A.col_start), + detail::compute_hash(original_lp_.A.i), + detail::compute_hash(original_lp_.A.x)); // Check slack #ifdef CHECK_SLACKS @@ -320,19 +347,30 @@ void branch_and_bound_t::set_initial_upper_bound(f_t bound) } template -void branch_and_bound_t::report_heuristic(f_t obj) +void branch_and_bound_t::report_heuristic(f_t obj, double work_time) { if (is_running_) { f_t user_obj = compute_user_objective(original_lp_, obj); f_t user_lower = compute_user_objective(original_lp_, get_lower_bound()); std::string user_gap = user_mip_gap(original_lp_, obj, get_lower_bound()); - - settings_.log.printf( - "H %+13.6e %+10.6e %s %9.2f\n", - user_obj, - user_lower, - user_gap.c_str(), - toc(exploration_stats_.start_time)); + if (settings_.deterministic) { + const double reported_work = work_time >= 0.0 ? work_time : work_unit_context_.current_work(); + settings_.log.printf( + "H %+13.6e %+10.6e %s " + "%9.2f %9.2f\n", + user_obj, + user_lower, + user_gap.c_str(), + reported_work, + toc(exploration_stats_.start_time)); + } else { + settings_.log.printf( + "H %+13.6e %+10.6e %s %9.2f\n", + user_obj, + user_lower, + user_gap.c_str(), + toc(exploration_stats_.start_time)); + } } else { if (solving_root_relaxation_.load()) { f_t user_obj = compute_user_objective(original_lp_, obj); @@ -461,8 +499,11 @@ void branch_and_bound_t::update_user_bound(f_t lower_bound) } template -void branch_and_bound_t::set_new_solution(const std::vector& solution) +void branch_and_bound_t::set_new_solution(const std::vector& solution, + cuopt::internals::mip_solution_origin_t origin) { + cuopt_assert(!settings_.deterministic, "set_new_solution is for opportunistic B&B only"); + mutex_original_lp_.lock(); if (solution.size() != original_problem_.num_cols) { settings_.log.printf( @@ -513,51 +554,91 @@ void branch_and_bound_t::set_new_solution(const std::vector& solu if (is_feasible) { report_heuristic(obj); } if (attempt_repair) { mutex_repair_.lock(); - repair_queue_.push_back(solution); + repair_queue_.push_back({solution, origin}); mutex_repair_.unlock(); } } template -void branch_and_bound_t::queue_external_solution_deterministic( - const std::vector& solution, double work_unit_ts) +void branch_and_bound_t::emit_solution_callback( + std::vector& original_x, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp) +{ + cuopt_assert(!settings_.deterministic || work_timestamp >= 0.0, + "work_timestamp must not be negative in deterministic mode"); + if (settings_.new_incumbent_callback != nullptr) { + settings_.log.debug("Publishing incumbent: obj=%g wut=%.6f origin=%s\n", + compute_user_objective(original_lp_, objective), + work_timestamp, + cuopt::internals::mip_solution_origin_to_string(origin)); + cuopt::internals::mip_solution_callback_info_t callback_info{}; + callback_info.origin = (uint32_t)origin; + callback_info.work_timestamp = work_timestamp; + settings_.new_incumbent_callback(original_x, objective, callback_info, work_timestamp); + } +} + +template +void branch_and_bound_t::emit_solution_callback_from_crushed( + const std::vector& crushed_solution, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp) { - // In deterministic mode, queue the solution to be processed at the correct work unit timestamp - // This ensures deterministic ordering of solution events + if (settings_.new_incumbent_callback == nullptr) { return; } + std::vector original_x; + uncrush_primal_solution(original_problem_, original_lp_, crushed_solution, original_x); + emit_solution_callback(original_x, objective, origin, work_timestamp); +} +template +void branch_and_bound_t::queue_external_solution_deterministic( + const std::vector& solution, + f_t user_objective, + double work_unit_ts, + cuopt::internals::mip_solution_origin_t origin) +{ if (solution.size() != original_problem_.num_cols) { settings_.log.printf( "Solution size mismatch %ld %d\n", solution.size(), original_problem_.num_cols); return; } + settings_.log.printf( + "Queueing deterministic external incumbent: obj=%g heur_wut=%.3f bnb_wut=%.3f origin=%s " + "hash=0x%x\n", + user_objective, + work_unit_ts, + work_unit_context_.current_work(), + cuopt::internals::mip_solution_origin_to_string(origin), + detail::compute_hash(solution)); mutex_original_lp_.lock(); - std::vector crushed_solution; - crush_primal_solution( - original_problem_, original_lp_, solution, new_slacks_, crushed_solution); - f_t obj = compute_objective(original_lp_, crushed_solution); - - // Validate solution before queueing - f_t primal_err; - f_t bound_err; - i_t num_fractional; - bool is_feasible = check_guess( - original_lp_, settings_, var_types_, crushed_solution, primal_err, bound_err, num_fractional); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic external crush ctx: wut=%.6f lp_rows=%d lp_cols=%d lp_nnz=%zu " + "active_cut_rows=%d " + "slacks=%zu slack_hash=0x%x rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x " + "Acol_hash=0x%x Arow_hash=0x%x Aval_hash=0x%x\n", + work_unit_ts, + original_lp_.num_rows, + original_lp_.num_cols, + original_lp_.A.x.size(), + std::max((i_t)0, original_lp_.num_rows - original_problem_.num_rows), + new_slacks_.size(), + detail::compute_hash(new_slacks_), + detail::compute_hash(original_lp_.rhs), + detail::compute_hash(original_lp_.lower), + detail::compute_hash(original_lp_.upper), + detail::compute_hash(original_lp_.A.col_start), + detail::compute_hash(original_lp_.A.i), + detail::compute_hash(original_lp_.A.x)); mutex_original_lp_.unlock(); - if (!is_feasible) { - // Queue the uncrushed solution for repair; it will be crushed at - // consumption time so that the crush reflects the current LP state - // (which may have gained slack columns from cuts added after this point). - mutex_repair_.lock(); - repair_queue_.push_back(solution); - mutex_repair_.unlock(); - return; - } - - // Queue the solution with its work unit timestamp mutex_heuristic_queue_.lock(); - heuristic_solution_queue_.push_back({obj, std::move(crushed_solution), 0, -1, 0, work_unit_ts}); + heuristic_solution_queue_.push_back({solution, user_objective, work_unit_ts, origin}); + const size_t heuristic_queue_size = heuristic_solution_queue_.size(); mutex_heuristic_queue_.unlock(); } @@ -620,6 +701,14 @@ bool branch_and_bound_t::repair_solution(const std::vector& edge_ num_fractional, repaired_obj); } + } else { + settings_.log.printf( + "Repair LP failed: status=%s iters=%d time=%.3fs time_limit=%.3f cut_off=%e\n", + dual::status_to_string(lp_status).c_str(), + iter, + toc(lp_start_time), + lp_settings.time_limit, + lp_settings.cut_off); } return feasible; @@ -630,7 +719,7 @@ void branch_and_bound_t::repair_heuristic_solutions() { raft::common::nvtx::range scope("BB::repair_heuristics"); // Check if there are any solutions to repair - std::vector> to_repair; + std::vector to_repair; mutex_repair_.lock(); if (repair_queue_.size() > 0) { to_repair = repair_queue_; @@ -640,7 +729,8 @@ void branch_and_bound_t::repair_heuristic_solutions() if (to_repair.size() > 0) { settings_.log.debug("Attempting to repair %ld injected solutions\n", to_repair.size()); - for (const std::vector& uncrushed_solution : to_repair) { + for (const auto& queued_solution : to_repair) { + const std::vector& uncrushed_solution = queued_solution.solution; std::vector crushed_solution; crush_primal_solution( original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution); @@ -652,15 +742,23 @@ void branch_and_bound_t::repair_heuristic_solutions() mutex_upper_.lock(); if (improves_incumbent(repaired_obj)) { - upper_bound_ = std::min(upper_bound_.load(), repaired_obj); + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), repaired_obj); incumbent_.set_incumbent_solution(repaired_obj, repaired_solution); - report_heuristic(repaired_obj); - - if (settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, repaired_solution, original_x); - settings_.solution_callback(original_x, repaired_obj); - } + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=repair_queue prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x\n", + previous_upper, + upper_bound_.load(), + repaired_obj, + detail::compute_hash(repaired_solution)); + report_heuristic(repaired_obj, queued_solution.work_timestamp); + + emit_solution_callback_from_crushed(repaired_solution, + repaired_obj, + queued_solution.origin, + queued_solution.work_timestamp); } mutex_upper_.unlock(); @@ -690,14 +788,47 @@ void branch_and_bound_t::set_solution_at_root(mip_solution_t compute_user_objective(original_lp_, root_objective_), toc(exploration_stats_.start_time)); - if (settings_.solution_callback != nullptr) { - settings_.solution_callback(solution.x, solution.objective); - } + emit_solution_callback(solution.x, + solution.objective, + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE, + work_unit_context_.current_work()); if (settings_.heuristic_preemption_callback != nullptr) { settings_.heuristic_preemption_callback(); } } +template +std::tuple> branch_and_bound_t::retire_queued_solution( + const queued_external_solution_t& queued_solution) +{ + f_t primal_err; + f_t bound_err; + i_t num_fractional; + std::vector crushed; + + mutex_original_lp_.lock(); + crush_primal_solution( + original_problem_, original_lp_, queued_solution.solution, new_slacks_, crushed); + f_t obj = compute_objective(original_lp_, crushed); + bool is_feasible = check_guess( + original_lp_, settings_, var_types_, crushed, primal_err, bound_err, num_fractional); + mutex_original_lp_.unlock(); + + if (is_feasible) { return {true, obj, std::move(crushed)}; } + + // Attempt repair immediately, no separate repair queue in deterministic mode + std::vector repaired_solution; + f_t repaired_obj; + bool repaired = repair_solution(edge_norms_, crushed, repaired_obj, repaired_solution); + if (repaired) { return {true, repaired_obj, std::move(repaired_solution)}; } + + CUOPT_DETERMINISM_LOG(settings_.log, + "Deterministic repair FAILED: wut=%.3f origin=%s\n", + queued_solution.work_timestamp, + cuopt::internals::mip_solution_origin_to_string(queued_solution.origin)); + return {false, {}, {}}; +} + template void branch_and_bound_t::set_final_solution(mip_solution_t& solution, f_t lower_bound) @@ -767,6 +898,53 @@ void branch_and_bound_t::set_final_solution(mip_solution_t& } } + // Drain any pending heuristic solutions that B&B never got to retire during exploration + // (e.g., root solve consumed the entire budget). + if (settings_.deterministic) { + const double current_work = work_unit_context_.current_work(); + mutex_heuristic_queue_.lock(); + std::vector pending; + pending.swap(heuristic_solution_queue_); + mutex_heuristic_queue_.unlock(); + + std::sort(pending.begin(), + pending.end(), + [](const queued_external_solution_t& a, const queued_external_solution_t& b) { + if (a.work_timestamp != b.work_timestamp) { + return a.work_timestamp < b.work_timestamp; + } + if (a.user_objective != b.user_objective) { + return a.user_objective < b.user_objective; + } + if (a.origin != b.origin) { return a.origin < b.origin; } + return a.solution < b.solution; + }); + + for (const auto& queued_solution : pending) { + if (queued_solution.work_timestamp > current_work) { continue; } + auto [feasible, obj, crushed] = retire_queued_solution(queued_solution); + if (feasible && improves_incumbent(obj)) { + upper_bound_ = std::min(upper_bound_.load(), obj); + incumbent_.set_incumbent_solution(obj, crushed); + settings_.log.debug( + "Late-retired heuristic incumbent: obj=%.6e wut=%.3f origin=%s\n", + compute_user_objective(original_lp_, obj), + queued_solution.work_timestamp, + cuopt::internals::mip_solution_origin_to_string(queued_solution.origin)); + emit_solution_callback_from_crushed( + crushed, obj, queued_solution.origin, queued_solution.work_timestamp); + } + } + size_t n_drained = pending.size(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Post-drain: user_upper=%.16e has_incumbent=%d drained=%zu user_lower_arg=%.16e\n", + compute_user_objective(original_lp_, upper_bound_.load()), + (int)incumbent_.has_incumbent, + n_drained, + compute_user_objective(original_lp_, lower_bound)); + } + if (has_solver_space_incumbent()) { uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x); solution.objective = incumbent_.objective; @@ -790,16 +968,29 @@ void branch_and_bound_t::add_feasible_solution(f_t leaf_objective, mutex_upper_.lock(); if (improves_incumbent(leaf_objective)) { + const f_t previous_upper = upper_bound_; incumbent_.set_incumbent_solution(leaf_objective, leaf_solution); upper_bound_ = std::min(upper_bound_.load(), leaf_objective); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=leaf prev_upper=%.16e new_upper=%.16e " + "obj=%.16e hash=0x%x depth=%d worker_type=%d\n", + previous_upper, + upper_bound_.load(), + leaf_objective, + detail::compute_hash(leaf_solution), + leaf_depth, + (int)thread_type); report(feasible_solution_symbol(thread_type), leaf_objective, get_lower_bound(), leaf_depth, 0); send_solution = true; } - if (send_solution && settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, original_x); - settings_.solution_callback(original_x, leaf_objective); + if (send_solution) { + emit_solution_callback_from_crushed( + incumbent_.x, + leaf_objective, + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE, + work_unit_context_.current_work()); } mutex_upper_.unlock(); } @@ -936,6 +1127,23 @@ struct nondeterministic_policy_t : tree_update_policy_t { f_t obj, const std::vector& x) override { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess( + bnb.original_lp_, bnb.settings_, bnb.var_types_, x, primal_err, bound_err, num_fractional); + if (!cg) { + bnb.settings_.log.printf( + "Rejecting infeasible integer solution: node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + node->node_id, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } bnb.add_feasible_solution(obj, x, node->depth, worker->search_strategy); } @@ -1008,8 +1216,11 @@ struct deterministic_policy_base_t : tree_update_policy_t { ? node->fractional_val - std::floor(node->fractional_val) : std::ceil(node->fractional_val) - node->fractional_val; if (frac > 1e-10) { - worker.pc_snapshot.queue_update( - node->branch_var, node->branch_dir, change / frac, worker.clock, worker.worker_id); + worker.pc_snapshot.queue_update(node->branch_var, + node->branch_dir, + change / frac, + worker.work_context.current_work(), + worker.worker_id); } } @@ -1029,17 +1240,94 @@ struct deterministic_bfs_policy_t const std::vector& x) override { if (obj < this->worker.local_upper_bound) { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess(this->bnb.original_lp_, + this->bnb.settings_, + this->bnb.var_types_, + x, + primal_err, + bound_err, + num_fractional); + if (!cg) { + this->bnb.settings_.log.printf( + "Rejecting infeasible integer solution: worker=%d node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + this->worker.worker_id, + node->creation_seq, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } this->worker.local_upper_bound = obj; + CUOPT_DETERMINISM_LOG( + bnb.settings_.log, + "BFS integer solution queued: worker=%d clock=%.6f ctx_work=%.6f obj=%.6e depth=%d\n", + this->worker.worker_id, + this->worker.work_context.current_work(), + this->worker.work_context.global_work_units_elapsed, + obj, + node->depth); this->worker.integer_solutions.push_back( - {obj, x, node->depth, this->worker.worker_id, this->worker.next_solution_seq++}); + {obj, + x, + node->depth, + this->worker.worker_id, + this->worker.next_solution_seq++, + this->worker.work_context.current_work(), + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE}); } } - branch_variable_t select_branch_variable(mip_node_t*, + branch_variable_t select_branch_variable(mip_node_t* node, const std::vector& fractional, const std::vector& x) override { - i_t var = this->worker.pc_snapshot.variable_selection(fractional, x); + i_t var; + if (this->bnb.settings_.reliability_branching != 0 && + this->worker.nodes_explored_snapshot > 0) { + auto& snap = this->worker.pc_snapshot; + + sb_update_callback_t on_sb_update = [&]( + i_t j, rounding_direction_t dir, f_t delta) { + snap.record_update( + j, dir, delta, this->worker.work_context.current_work(), this->worker.worker_id); + }; + + var = reliable_variable_selection_core(node, + fractional, + x, + this->bnb.settings_, + this->bnb.var_types_, + this->worker.leaf_problem, + this->worker.leaf_edge_norms, + this->worker.basis_factors, + this->worker.basic_list, + this->worker.nonbasic_list, + snap.sum_down_.data(), + snap.sum_up_.data(), + snap.num_down_.data(), + snap.num_up_.data(), + snap.n_vars(), + snap.strong_branching_lp_iter_, + this->worker.local_upper_bound, + (int64_t)this->worker.total_lp_iters_snapshot, + (int64_t)this->worker.nodes_explored_snapshot, + this->bnb.exploration_stats_.start_time, + this->bnb.pc_.reliability_branching_settings, + 1, + nullptr, + nullptr, + &this->worker.rng, + &this->worker.work_context, + on_sb_update); + } else { + var = this->worker.pc_snapshot.variable_selection(fractional, x); + } auto dir = martin_criteria(x[var], this->bnb.root_relax_soln_.x[var]); return {var, dir}; } @@ -1072,9 +1360,12 @@ struct deterministic_bfs_policy_t this->worker.enqueue_children_for_plunge(node->get_down_child(), node->get_up_child(), dir); break; case node_status_t::NUMERICAL: this->worker.record_numerical(node); break; + case node_status_t::PENDING: this->worker.plunge_stack.push_back(node); break; default: break; } - if (status != node_status_t::HAS_CHILDREN) { this->worker.recompute_bounds_and_basis = true; } + if (status != node_status_t::HAS_CHILDREN && status != node_status_t::PENDING) { + this->worker.recompute_bounds_and_basis = true; + } } void on_numerical_issue(mip_node_t* node) override @@ -1105,6 +1396,31 @@ struct deterministic_diving_policy_t const std::vector& x) override { if (obj < this->worker.local_upper_bound) { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess(this->bnb.original_lp_, + this->bnb.settings_, + this->bnb.var_types_, + x, + primal_err, + bound_err, + num_fractional); + if (!cg) { + this->bnb.settings_.log.printf( + "Rejecting infeasible diving integer solution: worker=%d node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + this->worker.worker_id, + node->creation_seq, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } + const f_t previous_local_upper = this->worker.local_upper_bound; + const int previous_seq = this->worker.next_solution_seq; this->worker.local_upper_bound = obj; this->worker.queue_integer_solution(obj, x, node->depth); } @@ -2017,6 +2333,18 @@ template mip_status_t branch_and_bound_t::solve(mip_solution_t& solution) { raft::common::nvtx::range scope("BB::solve"); + auto exploration_signal_guard = cuopt::scope_guard([this]() { + if (!exploration_started_.load()) { + std::lock_guard lock(exploration_started_mutex_); + exploration_started_ = true; + exploration_started_cv_.notify_all(); + } + }); + auto heuristic_preemption_guard = cuopt::scope_guard([this]() { + if (settings_.heuristic_preemption_callback != nullptr) { + settings_.heuristic_preemption_callback(); + } + }); logger_t log; log.log = false; @@ -2028,6 +2356,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_explored = 0; original_lp_.A.to_compressed_row(Arow_); + work_unit_scheduler_t* saved_scheduler = work_unit_context_.scheduler; + if (settings_.deterministic) { + work_unit_context_.deterministic = true; + cuopt_assert(settings_.bb_work_unit_scale > 0.0, "B&B work-unit scale must be positive"); + if (settings_.gpu_heur_wait_for_exploration) { + // Scale=0 during pre-exploration: root LP/cuts/SB don't advance the deterministic timeline. + // GPU heuristics start after exploration, so both timelines begin at 0 together. + work_unit_context_.work_unit_scale = 0.0; + } else { + // GPU heuristics race with B&B pre-exploration, so B&B work must advance normally. + work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale; + } + + // Detach the scheduler during the serial root/cuts/SB phase. + // record_work_sync_on_horizon still accumulates global_work_units_elapsed, + // but avoids scheduler->on_work_recorded + work_unit_context_.scheduler = nullptr; + } + settings_.log.printf("Reduced cost strengthening enabled: %d\n", settings_.reduced_cost_strengthening); @@ -2047,14 +2394,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut const f_t computed_obj = compute_objective(original_lp_, crushed_guess); mutex_upper_.lock(); incumbent_.set_incumbent_solution(computed_obj, crushed_guess); - upper_bound_ = computed_obj; + upper_bound_ = std::min(upper_bound_.load(), computed_obj); mutex_upper_.unlock(); } } root_relax_soln_.resize(original_lp_.num_rows, original_lp_.num_cols); - if (settings_.clique_cuts != 0 && clique_table_ == nullptr) { + // TODO: ensure clique tables work well w/ determinism + if (settings_.clique_cuts != 0 && clique_table_ == nullptr && !settings_.deterministic) { signal_extend_cliques_.store(false, std::memory_order_release); typename ::cuopt::linear_programming::mip_solver_settings_t::tolerances_t tolerances_for_clique{}; @@ -2104,7 +2452,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, root_vstatus_, - edge_norms_); + edge_norms_, + &work_unit_context_); } else { settings_.log.printf("\nSolving LP root relaxation in concurrent mode\n"); root_status = solve_root_relaxation(lp_settings, @@ -2118,6 +2467,10 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut solving_root_relaxation_ = false; exploration_stats_.total_lp_iters = root_relax_soln_.iterations; exploration_stats_.total_lp_solve_time = toc(exploration_stats_.start_time); + CUOPT_DETERMINISM_LOG(settings_.log, + "Post-root-LP work: %.16e iters=%d\n", + work_unit_context_.current_work(), + root_relax_soln_.iterations); auto finish_clique_thread = [this]() { if (clique_table_future_.valid()) { @@ -2163,7 +2516,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut assert(root_vstatus_.size() == original_lp_.num_cols); set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + { + const f_t previous_root_objective = root_objective_; + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic root objective assign: source=post_root_solve old=%.16e new=%.16e " + "x_hash=0x%x obj_hash=0x%x\n", + previous_root_objective, + root_objective_, + detail::compute_hash(root_relax_soln_.x), + detail::compute_hash(original_lp_.objective)); + } if (settings_.set_simplex_solution_callback != nullptr) { std::vector original_x; @@ -2395,7 +2759,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut nonbasic_list, root_relax_soln_, iter, - edge_norms_); + edge_norms_, + &work_unit_context_); exploration_stats_.total_lp_iters += iter; f_t dual_phase2_time = toc(dual_phase2_start_time); if (dual_phase2_time > 1.0) { @@ -2406,6 +2771,11 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut set_final_solution(solution, root_objective_); return solver_status_; } + if (cut_status == dual::status_t::WORK_LIMIT) { + solver_status_ = mip_status_t::WORK_LIMIT; + set_final_solution(solution, root_objective_); + return solver_status_; + } if (cut_status != dual::status_t::OPTIMAL) { settings_.log.printf("Numerical issue at root node. Resolving from scratch\n"); @@ -2418,12 +2788,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, root_vstatus_, - edge_norms_); + edge_norms_, + &work_unit_context_); if (scratch_status == lp_status_t::OPTIMAL) { // We recovered cut_status = convert_lp_status_to_dual_status(scratch_status); exploration_stats_.total_lp_iters += root_relax_soln_.iterations; - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + { + const f_t previous_root_objective = root_objective_; + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic root objective assign: source=cut_lp_scratch old=%.16e new=%.16e " + "pass=%d x_hash=0x%x obj_hash=0x%x\n", + previous_root_objective, + root_objective_, + cut_pass, + detail::compute_hash(root_relax_soln_.x), + detail::compute_hash(original_lp_.objective)); + } } else { settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str()); #ifdef WRITE_CUT_INFEASIBLE_MPS @@ -2461,9 +2844,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional); if (num_fractional == 0) { - upper_bound_ = root_objective_; + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), root_objective_); mutex_upper_.lock(); incumbent_.set_incumbent_solution(root_objective_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=root_integral_pass prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x\n", + previous_upper, + upper_bound_.load(), + root_objective_, + detail::compute_hash(root_relax_soln_.x)); mutex_upper_.unlock(); } f_t obj = upper_bound_.load(); @@ -2523,7 +2915,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, basis_update, - pc_); + pc_, + &work_unit_context_); } if (toc(exploration_stats_.start_time) > settings_.time_limit) { @@ -2605,6 +2998,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); } if (settings_.deterministic) { + pre_exploration_work_ = work_unit_context_.current_work(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Pre-exploration work breakdown: total=%.16e scale=%.6f deterministic=%d\n", + pre_exploration_work_, + work_unit_context_.work_unit_scale, + (int)work_unit_context_.deterministic); + work_unit_context_.scheduler = saved_scheduler; + work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale; settings_.log.printf( " | Explored | Unexplored | Objective | Bound | IntInf | Depth | Iter/Node " "| Gap | Work | Time |\n"); @@ -2614,11 +3016,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut "| Gap | Time |\n"); } + // Signal to producers (like GPU heuristics) that pre-exploration work is finished + { + std::lock_guard lock(exploration_started_mutex_); + exploration_started_ = true; + } + exploration_started_cv_.notify_all(); + + int bb_device_id = 0; + RAFT_CUDA_TRY(cudaGetDevice(&bb_device_id)); + if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } else if (settings_.num_threads > 1) { #pragma omp parallel num_threads(settings_.num_threads) { + // Any OMP thread may end up holding the lock during horizon syncs, and thus + // handle publication of solutions to the callback. Uncrush to the original problem requires + // GPU ops so ensure all threads call cudaSetDevice at init + RAFT_CUDA_TRY(cudaSetDevice(bb_device_id)); #pragma omp master run_scheduler(); } @@ -2633,6 +3049,13 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (deterministic_mode_enabled_) { lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Final lower bound: user_lb=%.16e user_ub=%.16e status=%d has_incumbent=%d\n", + compute_user_objective(original_lp_, lower_bound), + compute_user_objective(original_lp_, upper_bound_.load()), + (int)deterministic_global_termination_status_, + (int)incumbent_.has_incumbent); } else { if (node_queue_.best_first_queue_size() > 0) { // We need to clear the queue and use the info in the search tree for the lower bound @@ -2786,8 +3209,7 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; - // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = 2 * settings_.num_threads; + const i_t num_workers = settings_.num_threads; std::vector search_strategies = get_search_strategies(settings_.diving_settings); std::array max_num_workers = @@ -2800,7 +3222,7 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } deterministic_mode_enabled_ = true; - deterministic_current_horizon_ = deterministic_horizon_step_; + deterministic_current_horizon_ = pre_exploration_work_ + deterministic_horizon_step_; deterministic_horizon_number_ = 0; deterministic_global_termination_status_ = mip_status_t::UNSET; @@ -2828,14 +3250,17 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } } - deterministic_scheduler_ = std::make_unique(deterministic_horizon_step_); + deterministic_scheduler_ = + std::make_unique(deterministic_horizon_step_, pre_exploration_work_); scoped_context_registrations_t context_registrations(*deterministic_scheduler_); for (auto& worker : *deterministic_workers_) { + worker.work_context.set_current_work(pre_exploration_work_, false); context_registrations.add(worker.work_context); } if (deterministic_diving_workers_) { for (auto& worker : *deterministic_diving_workers_) { + worker.work_context.set_current_work(pre_exploration_work_, false); context_registrations.add(worker.work_context); } } @@ -2843,8 +3268,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri int actual_diving_workers = deterministic_diving_workers_ ? (int)deterministic_diving_workers_->size() : 0; settings_.log.printf( - "Deterministic Mode: %d BFS workers + %d diving workers, horizon step = %.2f work " - "units\n", + "Deterministic Mode: %d total threads split as %d BFS workers + %d diving workers, " + "horizon step = %.2f work units\n", + num_workers, num_bfs_workers, actual_diving_workers, deterministic_horizon_step_); @@ -2868,9 +3294,12 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } const int total_thread_count = num_bfs_workers + num_diving_workers; + int coordinator_device_id = 0; + RAFT_CUDA_TRY(cudaGetDevice(&coordinator_device_id)); #pragma omp parallel num_threads(total_thread_count) { + RAFT_CUDA_TRY(cudaSetDevice(coordinator_device_id)); int thread_id = omp_get_thread_num(); if (thread_id < num_bfs_workers) { auto& worker = (*deterministic_workers_)[thread_id]; @@ -2976,11 +3405,17 @@ void branch_and_bound_t::run_deterministic_bfs_loop( bool is_child = (node->parent == worker.last_solved_node); worker.recompute_bounds_and_basis = !is_child; - node_status_t status = solve_node_deterministic(worker, node, search_tree); - worker.last_solved_node = node; + node_status_t status = solve_node_deterministic(worker, node, search_tree); + worker.current_node = nullptr; - worker.current_node = nullptr; - continue; + if (status == node_status_t::PENDING) { + // Global termination limits were hit (TIME_LIMIT/WORK_LIMIT). Node was re-enqueued by + // on_node_completed. Fall through to sync barrier and let the sync callback handle + // termination. + } else { + worker.last_solved_node = node; + continue; + } } // No work - advance to sync point to participate in barrier @@ -2999,30 +3434,46 @@ void branch_and_bound_t::deterministic_sync_callback() double horizon_end = deterministic_current_horizon_; double wait_start = tic(); - producer_sync_.wait_for_producers(horizon_end); + if (!settings_.sub_mip) { producer_sync_.wait_for_producers(horizon_end); } double wait_time = toc(wait_start); total_producer_wait_time_ += wait_time; max_producer_wait_time_ = std::max(max_producer_wait_time_, wait_time); ++producer_wait_count_; - work_unit_context_.global_work_units_elapsed = horizon_end; + work_unit_context_.set_current_work(horizon_end, false); - bb_event_batch_t all_events = deterministic_workers_->collect_and_sort_events(); + { + std::string worker_clocks_str; + for (const auto& w : *deterministic_workers_) { + worker_clocks_str += std::to_string(w.worker_id) + ":" + + std::to_string(w.work_context.current_work()) + "/" + + std::to_string(w.integer_solutions.size()) + " "; + } + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic sync #%d: horizon=%.6f pre_expl=%.6f heur_q=%zu workers=[%s]\n", + deterministic_horizon_number_, + deterministic_current_horizon_, + pre_exploration_work_, + heuristic_solution_queue_.size(), + worker_clocks_str.c_str()); + } - deterministic_sort_replay_events(all_events); + bb_event_batch_t all_events = deterministic_workers_->collect_and_sort_events(); - // deterministic_prune_worker_nodes_vs_incumbent(); + std::vector::deterministic_replay_solution_t> + replay_solutions; + deterministic_collect_worker_solutions( + *deterministic_workers_, + [](const deterministic_bfs_worker_pool_t&, int) { + return search_strategy_t::BEST_FIRST; + }, + replay_solutions); + deterministic_collect_diving_solutions_and_update_pseudocosts(replay_solutions); - deterministic_collect_diving_solutions_and_update_pseudocosts(); + deterministic_sort_replay_events(all_events, replay_solutions); - for (auto& worker : *deterministic_workers_) { - worker.integer_solutions.clear(); - } - if (deterministic_diving_workers_) { - for (auto& worker : *deterministic_diving_workers_) { - worker.integer_solutions.clear(); - } - } + // deterministic_prune_worker_nodes_vs_incumbent(); deterministic_populate_diving_heap(); @@ -3079,6 +3530,19 @@ void branch_and_bound_t::deterministic_sync_callback() f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync termination check: horizon=%.6f user_lower=%.16e user_upper=%.16e abs_gap=%.6e " + "rel_gap=%.6e bfs_has_work=%d diving_has_work=%d status=%d\n", + deterministic_current_horizon_, + compute_user_objective(original_lp_, lower_bound), + compute_user_objective(original_lp_, upper_bound), + abs_gap, + rel_gap, + (int)deterministic_workers_->any_has_work(), + deterministic_diving_workers_ ? (int)deterministic_diving_workers_->any_has_work() : -1, + (int)deterministic_global_termination_status_); + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { deterministic_global_termination_status_ = mip_status_t::OPTIMAL; } @@ -3167,7 +3631,12 @@ node_status_t branch_and_bound_t::solve_node_deterministic( simplex_solver_settings_t lp_settings = settings_; lp_settings.set_log(false); - lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + if (original_lp_.objective_is_integral) { + lp_settings.cut_off = + std::ceil(worker.local_upper_bound - settings_.integer_tol) + settings_.dual_tol; + } else { + lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + } lp_settings.inside_mip = 2; lp_settings.time_limit = remaining_time; lp_settings.scale_columns = false; @@ -3199,7 +3668,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( std::vector& leaf_vstatus = node_ptr->vstatus; i_t node_iter = 0; f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + worker.leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3213,7 +3682,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.nonbasic_list, worker.leaf_solution, node_iter, - leaf_edge_norms, + worker.leaf_edge_norms, &worker.work_context); if (lp_status == dual::status_t::NUMERICAL) { @@ -3226,18 +3695,20 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basic_list, worker.nonbasic_list, leaf_vstatus, - leaf_edge_norms, + worker.leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); } - double work_performed = worker.work_context.global_work_units_elapsed - work_units_at_start; - worker.clock += work_performed; - exploration_stats_.total_lp_solve_time += toc(lp_start_time); exploration_stats_.total_lp_iters += node_iter; - ++exploration_stats_.nodes_explored; - --exploration_stats_.nodes_unexplored; + + bool lp_conclusive = + (lp_status != dual::status_t::TIME_LIMIT && lp_status != dual::status_t::WORK_LIMIT); + if (lp_conclusive) { + ++exploration_stats_.nodes_explored; + --exploration_stats_.nodes_unexplored; + } deterministic_bfs_policy_t policy{*this, worker}; auto [status, round_dir] = update_tree_impl(node_ptr, search_tree, &worker, lp_status, policy); @@ -3247,58 +3718,17 @@ node_status_t branch_and_bound_t::solve_node_deterministic( template template -void branch_and_bound_t::deterministic_process_worker_solutions( - PoolT& pool, WorkerTypeGetter get_worker_type) +void branch_and_bound_t::deterministic_collect_worker_solutions( + PoolT& pool, + WorkerTypeGetter get_worker_type, + std::vector::deterministic_replay_solution_t>& + replay_solutions) { - std::vector*> all_solutions; for (auto& worker : pool) { for (auto& sol : worker.integer_solutions) { - all_solutions.push_back(&sol); + const search_strategy_t strategy = get_worker_type(pool, sol.worker_id); + replay_solutions.push_back({std::move(sol), strategy}); } - } - - // relies on queued_integer_solution_t's operator< - // sorts based on objective first, then the tuple - std::sort(all_solutions.begin(), - all_solutions.end(), - [](const queued_integer_solution_t* a, - const queued_integer_solution_t* b) { return *a < *b; }); - - f_t deterministic_lower = deterministic_compute_lower_bound(); - f_t current_upper = upper_bound_.load(); - - for (const auto* sol : all_solutions) { - if (sol->objective < current_upper) { - f_t user_obj = compute_user_objective(original_lp_, sol->objective); - f_t user_lower = compute_user_objective(original_lp_, deterministic_lower); - i_t nodes_explored = exploration_stats_.nodes_explored.load(); - i_t nodes_unexplored = exploration_stats_.nodes_unexplored.load(); - - search_strategy_t worker_type = get_worker_type(pool, sol->worker_id); - report(feasible_solution_symbol(worker_type), - sol->objective, - deterministic_lower, - sol->depth, - 0, - deterministic_current_horizon_); - - bool improved = false; - if (improves_incumbent(sol->objective)) { - upper_bound_ = std::min(upper_bound_.load(), sol->objective); - incumbent_.set_incumbent_solution(sol->objective, sol->solution); - current_upper = sol->objective; - improved = true; - } - - if (improved && settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, sol->solution, original_x); - settings_.solution_callback(original_x, sol->objective); - } - } - } - - for (auto& worker : pool) { worker.integer_solutions.clear(); } } @@ -3308,12 +3738,17 @@ template void branch_and_bound_t::deterministic_merge_pseudo_cost_updates(PoolT& pool) { std::vector> all_pc_updates; + int64_t sb_iter_delta = 0; + int64_t base_sb = pc_.strong_branching_lp_iter.load(); for (auto& worker : pool) { auto updates = worker.pc_snapshot.take_updates(); all_pc_updates.insert(all_pc_updates.end(), updates.begin(), updates.end()); + int64_t snapshot_sb = worker.pc_snapshot.strong_branching_lp_iter_; + sb_iter_delta += snapshot_sb - base_sb; } std::sort(all_pc_updates.begin(), all_pc_updates.end()); pc_.merge_updates(all_pc_updates); + pc_.strong_branching_lp_iter += sb_iter_delta; } template @@ -3324,6 +3759,7 @@ void branch_and_bound_t::deterministic_broadcast_snapshots( deterministic_snapshot_t snap; snap.upper_bound = upper_bound_.load(); snap.total_lp_iters = exploration_stats_.total_lp_iters.load(); + snap.nodes_explored = exploration_stats_.nodes_explored.load(); snap.incumbent = incumbent_snapshot; snap.pc_snapshot = pc_.create_snapshot(); @@ -3334,91 +3770,158 @@ void branch_and_bound_t::deterministic_broadcast_snapshots( template void branch_and_bound_t::deterministic_sort_replay_events( - const bb_event_batch_t& events) + const bb_event_batch_t& events, + std::vector::deterministic_replay_solution_t>& + replay_solutions) { - // Infeasible solutions from GPU heuristics are queued for repair; process them now + // Retire external solutions that have reached the current horizon. Feasibility + // classification and repair happen only here in deterministic mode. { - std::vector> to_repair; - // TODO: support repair queue in deterministic mode - // mutex_repair_.lock(); - // if (repair_queue_.size() > 0) { - // to_repair = repair_queue_; - // repair_queue_.clear(); - // } - // mutex_repair_.unlock(); - - std::sort(to_repair.begin(), - to_repair.end(), - [](const std::vector& a, const std::vector& b) { return a < b; }); - - if (to_repair.size() > 0) { - settings_.log.debug("Deterministic sync: Attempting to repair %ld injected solutions\n", - to_repair.size()); - for (const std::vector& uncrushed_solution : to_repair) { - std::vector crushed_solution; - crush_primal_solution( - original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution); - std::vector repaired_solution; - f_t repaired_obj; - bool success = - repair_solution(edge_norms_, crushed_solution, repaired_obj, repaired_solution); - if (success) { - // Queue repaired solution with work unit timestamp (...workstamp?) - mutex_heuristic_queue_.lock(); - heuristic_solution_queue_.push_back( - {repaired_obj, std::move(repaired_solution), 0, -1, 0, deterministic_current_horizon_}); - mutex_heuristic_queue_.unlock(); + std::vector due_solutions; + mutex_heuristic_queue_.lock(); + { + std::vector future_solutions; + for (auto& sol : heuristic_solution_queue_) { + if (sol.work_timestamp <= deterministic_current_horizon_) { + due_solutions.push_back(std::move(sol)); + } else { + future_solutions.push_back(std::move(sol)); } } + heuristic_solution_queue_ = std::move(future_solutions); } - } - - // Extract heuristic solutions, keeping future solutions for next horizon - // Use deterministic_current_horizon_ as the upper bound (horizon_end) - std::vector> heuristic_solutions; - mutex_heuristic_queue_.lock(); - { - std::vector> future_solutions; - for (auto& sol : heuristic_solution_queue_) { - if (sol.work_timestamp < deterministic_current_horizon_) { - heuristic_solutions.push_back(std::move(sol)); - } else { - future_solutions.push_back(std::move(sol)); + mutex_heuristic_queue_.unlock(); + + std::sort(due_solutions.begin(), + due_solutions.end(), + [](const queued_external_solution_t& a, const queued_external_solution_t& b) { + if (a.work_timestamp != b.work_timestamp) { + return a.work_timestamp < b.work_timestamp; + } + if (a.user_objective != b.user_objective) { + return a.user_objective < b.user_objective; + } + if (a.origin != b.origin) { return a.origin < b.origin; } + return a.solution < b.solution; + }); + + if (!due_solutions.empty() || !heuristic_solution_queue_.empty()) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic sync retire: horizon=%.6f due=%zu future=%zu pre_expl=%.6f\n", + deterministic_current_horizon_, + due_solutions.size(), + heuristic_solution_queue_.size(), + pre_exploration_work_); + for (size_t i = 0; i < due_solutions.size(); ++i) { + CUOPT_DETERMINISM_LOG( + settings_.log, + " due[%zu]: wut=%.6f obj=%g origin=%s\n", + i, + due_solutions[i].work_timestamp, + due_solutions[i].user_objective, + cuopt::internals::mip_solution_origin_to_string(due_solutions[i].origin)); + } + } + if (!due_solutions.empty()) { + CUOPT_DETERMINISM_LOG(settings_.log, + "Deterministic sync: retiring %ld external solutions\n", + due_solutions.size()); + for (const auto& queued_solution : due_solutions) { + auto [feasible, obj, crushed] = retire_queued_solution(queued_solution); + if (feasible) { + replay_solutions.push_back({{obj, + std::move(crushed), + 0, + -1, + 0, + queued_solution.work_timestamp, + queued_solution.origin}, + search_strategy_t::BEST_FIRST}); + } } } - heuristic_solution_queue_ = std::move(future_solutions); } - mutex_heuristic_queue_.unlock(); + if (!replay_solutions.empty() || !heuristic_solution_queue_.empty()) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay extract: horizon=%.6f now=%zu future=%zu upper=%.16e\n", + deterministic_current_horizon_, + replay_solutions.size(), + heuristic_solution_queue_.size(), + upper_bound_.load()); + } - // sort by work unit timestamp, with objective and solution values as tie-breakers - std::sort( - heuristic_solutions.begin(), - heuristic_solutions.end(), - [](const queued_integer_solution_t& a, const queued_integer_solution_t& b) { - if (a.work_timestamp != b.work_timestamp) { return a.work_timestamp < b.work_timestamp; } - if (a.objective != b.objective) { return a.objective < b.objective; } - return a.solution < b.solution; // edge-case - lexicographical comparison - }); + // Sort the full replay stream by work unit timestamp, with stable deterministic tie-breakers. + std::sort(replay_solutions.begin(), replay_solutions.end(), [](const auto& a, const auto& b) { + if (a.solution.work_timestamp != b.solution.work_timestamp) { + return a.solution.work_timestamp < b.solution.work_timestamp; + } + if (a.solution.objective != b.solution.objective) { + return a.solution.objective < b.solution.objective; + } + if (a.solution.origin != b.solution.origin) { return a.solution.origin < b.solution.origin; } + if (a.solution.worker_id != b.solution.worker_id) { + return a.solution.worker_id < b.solution.worker_id; + } + if (a.solution.sequence_id != b.solution.sequence_id) { + return a.solution.sequence_id < b.solution.sequence_id; + } + return a.solution.solution < b.solution.solution; + }); - // Merge B&B events and heuristic solutions for unified timeline replay - size_t event_idx = 0; - size_t heuristic_idx = 0; + f_t deterministic_lower = deterministic_compute_lower_bound(); + f_t current_upper = upper_bound_.load(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync replay begin: horizon=%.6f n_events=%zu n_solutions=%zu user_upper_before=%.16e\n", + deterministic_current_horizon_, + events.events.size(), + replay_solutions.size(), + compute_user_objective(original_lp_, current_upper)); + if (deterministic_current_horizon_ <= deterministic_horizon_step_) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic solution replay: candidates=%zu lower=%.16e upper_before=%.16e\n", + replay_solutions.size(), + deterministic_lower, + current_upper); + for (size_t i = 0; i < replay_solutions.size(); ++i) { + const auto& replay = replay_solutions[i]; + const auto& sol = replay.solution; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay solution[%zu]: wut=%.6f obj=%.16e origin=%s worker=%d seq=%d " + "depth=%d sol_hash=0x%x\n", + i, + sol.work_timestamp, + sol.objective, + cuopt::internals::mip_solution_origin_to_string(sol.origin), + sol.worker_id, + sol.sequence_id, + sol.depth, + detail::compute_hash(sol.solution)); + } + } - while (event_idx < events.events.size() || heuristic_idx < heuristic_solutions.size()) { - bool process_event = false; - bool process_heuristic = false; + // Merge B&B events and all incumbent-producing solutions for unified timeline replay. + size_t event_idx = 0; + size_t solution_idx = 0; + + while (event_idx < events.events.size() || solution_idx < replay_solutions.size()) { + bool process_event = false; + bool process_solution = false; if (event_idx >= events.events.size()) { - process_heuristic = true; - } else if (heuristic_idx >= heuristic_solutions.size()) { + process_solution = true; + } else if (solution_idx >= replay_solutions.size()) { process_event = true; } else { - // Both have items - pick the one with smaller WUT if (events.events[event_idx].work_timestamp <= - heuristic_solutions[heuristic_idx].work_timestamp) { + replay_solutions[solution_idx].solution.work_timestamp) { process_event = true; } else { - process_heuristic = true; + process_solution = true; } } @@ -3433,42 +3936,80 @@ void branch_and_bound_t::deterministic_sort_replay_events( } } - if (process_heuristic) { - const auto& hsol = heuristic_solutions[heuristic_idx++]; - - CUOPT_LOG_TRACE( - "Deterministic sync: Heuristic solution received at WUT %f with objective %g, current " - "horizon %f", - hsol.work_timestamp, - hsol.objective, - deterministic_current_horizon_); - - // Process heuristic solution at its correct work unit timestamp position - f_t new_upper = std::numeric_limits::infinity(); + if (process_solution) { + const auto& replay = replay_solutions[solution_idx++]; + const auto& sol = replay.solution; + bool improved = false; - if (improves_incumbent(hsol.objective)) { - upper_bound_ = std::min(upper_bound_.load(), hsol.objective); - incumbent_.set_incumbent_solution(hsol.objective, hsol.solution); - new_upper = hsol.objective; + if (improves_incumbent(sol.objective)) { + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), sol.objective); + incumbent_.set_incumbent_solution(sol.objective, sol.solution); + current_upper = sol.objective; + improved = true; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=det_replay prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x worker=%d seq=%d wut=%.6f horizon=%.6f\n", + previous_upper, + upper_bound_.load(), + sol.objective, + detail::compute_hash(sol.solution), + sol.worker_id, + sol.sequence_id, + sol.work_timestamp, + deterministic_current_horizon_); } - - if (new_upper < std::numeric_limits::infinity()) { - report_heuristic(new_upper); - - if (settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, hsol.solution, original_x); - settings_.solution_callback(original_x, hsol.objective); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay: horizon=%.6f wut=%.6f obj=%.16e origin=%s accepted=%d " + "upper_now=%.16e worker=%d seq=%d sol_hash=0x%x\n", + deterministic_current_horizon_, + sol.work_timestamp, + sol.objective, + cuopt::internals::mip_solution_origin_to_string(sol.origin), + (int)improved, + current_upper, + sol.worker_id, + sol.sequence_id, + detail::compute_hash(sol.solution)); + + if (improved) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay PUBLISH: horizon=%.6f wut=%.6f obj=%g origin=%s worker=%d " + "upper_after=%.16e\n", + deterministic_current_horizon_, + sol.work_timestamp, + compute_user_objective(original_lp_, sol.objective), + cuopt::internals::mip_solution_origin_to_string(sol.origin), + sol.worker_id, + current_upper); + if (sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE || + sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING) { + report(feasible_solution_symbol(replay.strategy), + sol.objective, + deterministic_lower, + sol.depth, + 0, + deterministic_current_horizon_); + } else { + report_heuristic(sol.objective, sol.work_timestamp); } + emit_solution_callback_from_crushed( + sol.solution, sol.objective, sol.origin, sol.work_timestamp); } } } - // Merge integer solutions from BFS workers and update global incumbent - deterministic_process_worker_solutions(*deterministic_workers_, - [](const deterministic_bfs_worker_pool_t&, int) { - return search_strategy_t::BEST_FIRST; - }); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync replay done: horizon=%.6f user_upper_after=%.16e events_processed=%zu " + "solutions_processed=%zu\n", + deterministic_current_horizon_, + compute_user_objective(original_lp_, upper_bound_.load()), + event_idx, + solution_idx); // Merge and apply pseudo-cost updates from BFS workers deterministic_merge_pseudo_cost_updates(*deterministic_workers_); @@ -3525,52 +4066,44 @@ void branch_and_bound_t::deterministic_balance_worker_loads() constexpr bool force_rebalance_every_sync = false; - // Count work for each worker: current_node (if any) + plunge_stack + backlog - std::vector work_counts(num_workers); - size_t total_work = 0; - size_t max_work = 0; - size_t min_work = std::numeric_limits::max(); + std::vector backlog_counts(num_workers); + size_t total_backlog = 0; + size_t max_backlog = 0; + size_t min_backlog = std::numeric_limits::max(); for (size_t w = 0; w < num_workers; ++w) { - auto& worker = (*deterministic_workers_)[w]; - work_counts[w] = worker.queue_size(); - total_work += work_counts[w]; - max_work = std::max(max_work, work_counts[w]); - min_work = std::min(min_work, work_counts[w]); + auto& worker = (*deterministic_workers_)[w]; + backlog_counts[w] = worker.backlog.size(); + total_backlog += backlog_counts[w]; + max_backlog = std::max(max_backlog, backlog_counts[w]); + min_backlog = std::min(min_backlog, backlog_counts[w]); } - if (total_work == 0) return; + if (total_backlog == 0) return; bool needs_balance; if (force_rebalance_every_sync) { - needs_balance = (total_work > 1); + needs_balance = (total_backlog > 1); } else { - needs_balance = (min_work == 0 && max_work >= 2) || (min_work > 0 && max_work > 4 * min_work); + needs_balance = + (min_backlog == 0 && max_backlog >= 2) || (min_backlog > 0 && max_backlog > 4 * min_backlog); } if (!needs_balance) return; - std::vector*> all_nodes; + std::vector*> all_backlog_nodes; for (auto& worker : *deterministic_workers_) { for (auto* node : worker.backlog.data()) { - all_nodes.push_back(node); + all_backlog_nodes.push_back(node); } worker.backlog.clear(); } - if (all_nodes.empty()) return; - - auto deterministic_less = [](const mip_node_t* a, const mip_node_t* b) { - if (a->origin_worker_id != b->origin_worker_id) { - return a->origin_worker_id < b->origin_worker_id; - } - return a->creation_seq < b->creation_seq; - }; - std::sort(all_nodes.begin(), all_nodes.end(), deterministic_less); + if (all_backlog_nodes.empty()) return; - // Distribute nodes - for (size_t i = 0; i < all_nodes.size(); ++i) { + // Round-robin distribute into backlogs; priority queue handles ordering internally + for (size_t i = 0; i < all_backlog_nodes.size(); ++i) { size_t worker_idx = i % num_workers; - (*deterministic_workers_)[worker_idx].enqueue_node(all_nodes[i]); + (*deterministic_workers_)[worker_idx].backlog.push(all_backlog_nodes[i]); } } @@ -3598,11 +4131,33 @@ f_t branch_and_bound_t::deterministic_compute_lower_bound() } } + f_t min_from_workers = lower_bound; + // Tree is exhausted if (lower_bound == std::numeric_limits::infinity() && incumbent_.has_incumbent) { lower_bound = upper_bound_.load(); } + lower_bound = std::min(lower_bound, upper_bound_.load()); + + CUOPT_DETERMINISM_LOG( + settings_.log, + "compute_lower_bound: user_min_bfs=%.16e user_upper=%.16e user_result=%.16e " + "has_incumbent=%d n_bfs_nodes=%d\n", + compute_user_objective(original_lp_, min_from_workers), + compute_user_objective(original_lp_, upper_bound_.load()), + compute_user_objective(original_lp_, lower_bound), + (int)incumbent_.has_incumbent, + [&]() { + int count = 0; + for (const auto& w : *deterministic_workers_) { + count += (w.current_node != nullptr ? 1 : 0); + count += (int)w.plunge_stack.size(); + count += (int)w.backlog.size(); + } + return count; + }()); + return lower_bound; } @@ -3690,19 +4245,27 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() } template -void branch_and_bound_t::deterministic_collect_diving_solutions_and_update_pseudocosts() +void branch_and_bound_t::deterministic_collect_diving_solutions_and_update_pseudocosts( + std::vector::deterministic_replay_solution_t>& + replay_solutions) { if (!deterministic_diving_workers_) return; - // Collect integer solutions from diving workers and update global incumbent - deterministic_process_worker_solutions( + deterministic_collect_worker_solutions( *deterministic_diving_workers_, [](const deterministic_diving_worker_pool_t& pool, int worker_id) { return pool[worker_id].diving_type; - }); + }, + replay_solutions); // Merge pseudo-cost updates from diving workers deterministic_merge_pseudo_cost_updates(*deterministic_diving_workers_); + + for (auto& worker : *deterministic_diving_workers_) { + i_t delta = worker.total_nodes_explored - worker.nodes_explored_last_sync; + worker.nodes_explored_last_sync = worker.total_nodes_explored; + exploration_stats_.nodes_explored += delta; + } } template @@ -3777,7 +4340,12 @@ void branch_and_bound_t::deterministic_dive( // Setup LP settings simplex_solver_settings_t lp_settings = settings_; lp_settings.set_log(false); - lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + if (original_lp_.objective_is_integral) { + lp_settings.cut_off = + std::ceil(worker.local_upper_bound - settings_.integer_tol) + settings_.dual_tol; + } else { + lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + } lp_settings.inside_mip = 2; lp_settings.time_limit = remaining_time; lp_settings.scale_columns = false; @@ -3787,7 +4355,6 @@ void branch_and_bound_t::deterministic_dive( lp_settings, worker.bounds_changed, worker.leaf_problem.lower, worker.leaf_problem.upper); if (settings_.deterministic) { - // TEMP APPROXIMATION; worker.work_context.record_work_sync_on_horizon(worker.node_presolver.last_nnz_processed / 1e8); } @@ -3841,17 +4408,16 @@ void branch_and_bound_t::deterministic_dive( lp_status = convert_lp_status_to_dual_status(second_status); } - ++nodes_this_dive; - ++worker.total_nodes_explored; worker.lp_iters_this_dive += node_iter; - worker.clock = worker.work_context.global_work_units_elapsed; - if (lp_status == dual::status_t::TIME_LIMIT || lp_status == dual::status_t::WORK_LIMIT || lp_status == dual::status_t::ITERATION_LIMIT) { break; } + ++nodes_this_dive; + ++worker.total_nodes_explored; + deterministic_diving_policy_t policy{*this, worker, stack, max_backtrack_depth}; update_tree_impl(node_ptr, dive_tree, &worker, lp_status, policy); } diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index f2917ba930..7dec38b640 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -35,9 +36,12 @@ #include #include +#include #include #include #include +#include +#include #include namespace cuopt::linear_programming::detail { @@ -108,10 +112,15 @@ class branch_and_bound_t { } // Set a solution based on the user problem during the course of the solve - void set_new_solution(const std::vector& solution); + void set_new_solution(const std::vector& solution, + cuopt::internals::mip_solution_origin_t origin = + cuopt::internals::mip_solution_origin_t::UNKNOWN); // This queues the solution to be processed at the correct work unit timestamp - void queue_external_solution_deterministic(const std::vector& solution, double work_unit_ts); + void queue_external_solution_deterministic(const std::vector& solution, + f_t user_objective, + double work_unit_ts, + cuopt::internals::mip_solution_origin_t origin); void set_user_bound_callback(std::function callback) { @@ -157,6 +166,12 @@ class branch_and_bound_t { // Get producer sync for external heuristics (e.g., CPUFJ) to register producer_sync_t& get_producer_sync() { return producer_sync_; } + void wait_for_exploration_start() + { + std::unique_lock lock(exploration_started_mutex_); + exploration_started_cv_.wait(lock, [this] { return exploration_started_.load(); }); + } + private: const user_problem_t& original_problem_; const simplex_solver_settings_t settings_; @@ -166,6 +181,10 @@ class branch_and_bound_t { std::atomic signal_extend_cliques_{false}; work_limit_context_t work_unit_context_{"B&B"}; + double pre_exploration_work_{0.0}; + std::atomic exploration_started_{false}; + std::mutex exploration_started_mutex_; + std::condition_variable exploration_started_cv_; // Initial guess. std::vector guess_; @@ -214,7 +233,13 @@ class branch_and_bound_t { // Mutex for repair omp_mutex_t mutex_repair_; - std::vector> repair_queue_; + struct queued_repair_solution_t { + std::vector solution; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::UNKNOWN}; + double work_timestamp{-1.0}; + }; + std::vector repair_queue_; // Variables for the root node in the search tree. std::vector root_vstatus_; @@ -262,13 +287,21 @@ class branch_and_bound_t { omp_atomic_t lower_bound_ceiling_; std::function user_bound_callback_; - void report_heuristic(f_t obj); + void report_heuristic(f_t obj, double work_time = -1.0); void report(char symbol, f_t obj, f_t lower_bound, i_t node_depth, i_t node_int_infeas, double work_time = -1); + void emit_solution_callback(std::vector& original_x, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp); + void emit_solution_callback_from_crushed(const std::vector& crushed_solution, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp); // Set the solution when found at the root node void set_solution_at_root(mip_solution_t& solution, @@ -341,7 +374,14 @@ class branch_and_bound_t { void run_deterministic_coordinator(const csr_matrix_t& Arow); // Gather all events generated, sort by WU timestamp, apply - void deterministic_sort_replay_events(const bb_event_batch_t& events); + struct deterministic_replay_solution_t { + queued_integer_solution_t solution; + search_strategy_t strategy{search_strategy_t::BEST_FIRST}; + }; + + void deterministic_sort_replay_events( + const bb_event_batch_t& events, + std::vector& replay_solutions); // Prune nodes held by workers based on new incumbent void deterministic_prune_worker_nodes_vs_incumbent(); @@ -374,10 +414,14 @@ class branch_and_bound_t { void deterministic_assign_diving_nodes(); // Collect and merge diving solutions at sync - void deterministic_collect_diving_solutions_and_update_pseudocosts(); + void deterministic_collect_diving_solutions_and_update_pseudocosts( + std::vector& replay_solutions); template - void deterministic_process_worker_solutions(PoolT& pool, WorkerTypeGetter get_worker_type); + void deterministic_collect_worker_solutions( + PoolT& pool, + WorkerTypeGetter get_worker_type, + std::vector& replay_solutions); template void deterministic_merge_pseudo_cost_updates(PoolT& pool); @@ -408,10 +452,22 @@ class branch_and_bound_t { double max_producer_wait_time_{0.0}; i_t producer_wait_count_{0}; - // Determinism heuristic solution queue - solutions received from GPU heuristics - // Stored with work unit timestamp for deterministic ordering + struct queued_external_solution_t { + std::vector solution; + f_t user_objective{std::numeric_limits::infinity()}; + double work_timestamp{0.0}; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::UNKNOWN}; + }; + + std::tuple> retire_queued_solution( + const queued_external_solution_t& queued_solution); + + // Deterministic pending external solution queue. + // External solutions stay raw until their retirement horizon, where they are + // crushed, checked, and repaired immediately if needed. omp_mutex_t mutex_heuristic_queue_; - std::vector> heuristic_solution_queue_; + std::vector heuristic_solution_queue_; // ============================================================================ // Determinism Diving state diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 7a074051c6..b90706285b 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -44,6 +45,8 @@ struct queued_integer_solution_t { int worker_id{-1}; int sequence_id{0}; double work_timestamp{0.0}; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE}; bool operator<(const queued_integer_solution_t& other) const { @@ -59,6 +62,7 @@ struct deterministic_snapshot_t { pseudo_cost_snapshot_t pc_snapshot; std::vector incumbent; i_t total_lp_iters; + i_t nodes_explored; }; template @@ -66,7 +70,6 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { using base_t = branch_and_bound_worker_t; public: - double clock{0.0}; work_limit_context_t work_context; pseudo_cost_snapshot_t pc_snapshot; @@ -75,6 +78,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { // Diving-specific snapshots (ignored by BFS workers) std::vector incumbent_snapshot; i_t total_lp_iters_snapshot{0}; + i_t nodes_explored_snapshot{0}; std::vector> integer_solutions; int next_solution_seq{0}; @@ -101,6 +105,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { pc_snapshot = snap.pc_snapshot; incumbent_snapshot = snap.incumbent; total_lp_iters_snapshot = snap.total_lp_iters; + nodes_explored_snapshot = snap.nodes_explored; } bool has_work() const { return static_cast(this)->has_work_impl(); } @@ -158,11 +163,6 @@ class deterministic_bfs_worker_t mip_node_t* up_child, rounding_direction_t preferred_direction) { - if (!plunge_stack.empty()) { - backlog.push(plunge_stack.back()); - plunge_stack.pop_back(); - } - down_child->origin_worker_id = this->worker_id; down_child->creation_seq = next_creation_seq++; up_child->origin_worker_id = this->worker_id; @@ -170,11 +170,11 @@ class deterministic_bfs_worker_t mip_node_t* first_child; if (preferred_direction == rounding_direction_t::UP) { - plunge_stack.push_front(down_child); + backlog.push(down_child); plunge_stack.push_front(up_child); first_child = up_child; } else { - plunge_stack.push_front(up_child); + backlog.push(up_child); plunge_stack.push_front(down_child); first_child = down_child; } @@ -211,7 +211,7 @@ class deterministic_bfs_worker_t void record_branched( mip_node_t* node, i_t down_child_id, i_t up_child_id, i_t branch_var, f_t branch_val) { - record_event(bb_event_t::make_branched(this->clock, + record_event(bb_event_t::make_branched(this->work_context.current_work(), this->worker_id, node->creation_seq, down_child_id, @@ -227,7 +227,7 @@ class deterministic_bfs_worker_t void record_integer_solution(mip_node_t* node, f_t objective) { record_event(bb_event_t::make_integer_solution( - this->clock, this->worker_id, node->creation_seq, objective)); + this->work_context.current_work(), this->worker_id, node->creation_seq, objective)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++this->total_integer_solutions; @@ -236,7 +236,7 @@ class deterministic_bfs_worker_t void record_fathomed(mip_node_t* node, f_t lower_bound) { record_event(bb_event_t::make_fathomed( - this->clock, this->worker_id, node->creation_seq, lower_bound)); + this->work_context.current_work(), this->worker_id, node->creation_seq, lower_bound)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++total_nodes_pruned; @@ -244,8 +244,8 @@ class deterministic_bfs_worker_t void record_infeasible(mip_node_t* node) { - record_event( - bb_event_t::make_infeasible(this->clock, this->worker_id, node->creation_seq)); + record_event(bb_event_t::make_infeasible( + this->work_context.current_work(), this->worker_id, node->creation_seq)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++total_nodes_infeasible; @@ -253,8 +253,8 @@ class deterministic_bfs_worker_t void record_numerical(mip_node_t* node) { - record_event( - bb_event_t::make_numerical(this->clock, this->worker_id, node->creation_seq)); + record_event(bb_event_t::make_numerical( + this->work_context.current_work(), this->worker_id, node->creation_seq)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; } @@ -288,6 +288,7 @@ class deterministic_diving_worker_t // Diving statistics i_t total_nodes_explored{0}; + i_t nodes_explored_last_sync{0}; i_t total_dives{0}; i_t lp_iters_this_dive{0}; @@ -339,7 +340,13 @@ class deterministic_diving_worker_t void queue_integer_solution(f_t objective, const std::vector& solution, i_t depth) { this->integer_solutions.push_back( - {objective, solution, depth, this->worker_id, this->next_solution_seq++}); + {objective, + solution, + depth, + this->worker_id, + this->next_solution_seq++, + this->work_context.current_work(), + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING}); ++this->total_integer_solutions; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27d..bddda3ae78 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -30,7 +30,8 @@ namespace { static bool is_dual_simplex_done(dual::status_t status) { return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || - status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; + status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::WORK_LIMIT || + status == dual::status_t::CUTOFF; } template @@ -303,7 +304,8 @@ void strong_branch_helper(i_t start, std::vector& dual_simplex_obj_up, std::vector& dual_simplex_status_down, std::vector& dual_simplex_status_up, - shared_strong_branching_context_view_t& sb_view) + shared_strong_branching_context_view_t& sb_view, + cuopt::work_limit_context_t* work_unit_context = nullptr) { raft::common::nvtx::range scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; @@ -361,14 +363,15 @@ void strong_branch_helper(i_t start, vstatus, solution, iter, - child_edge_norms); + child_edge_norms, + work_unit_context); f_t obj = std::numeric_limits::quiet_NaN(); if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible obj = std::numeric_limits::infinity(); } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || - status == dual::status_t::CUTOFF) { + status == dual::status_t::WORK_LIMIT || status == dual::status_t::CUTOFF) { obj = compute_objective(child_problem, solution.x); } else { settings.log.debug("Thread id %2d remaining %d variable %d branch %d status %d\n", @@ -463,7 +466,8 @@ std::pair trial_branching(const lp_problem_t& ori f_t upper_bound, f_t start_time, i_t iter_limit, - omp_atomic_t& total_lp_iter) + omp_atomic_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) { lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = branch_var_lower; @@ -501,7 +505,8 @@ std::pair trial_branching(const lp_problem_t& ori child_nonbasic_list, solution, iter, - child_edge_norms); + child_edge_norms, + work_ctx); total_lp_iter += iter; settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n", branch_var, @@ -515,13 +520,95 @@ std::pair trial_branching(const lp_problem_t& ori // LP was infeasible return {std::numeric_limits::infinity(), dual::status_t::DUAL_UNBOUNDED}; } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || - status == dual::status_t::CUTOFF) { + status == dual::status_t::WORK_LIMIT || status == dual::status_t::CUTOFF) { return {compute_objective(child_problem, solution.x), status}; } else { return {std::numeric_limits::quiet_NaN(), dual::status_t::NUMERICAL}; } } +template +f_t trial_branching_generic(const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + omp_atomic_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) +{ + const i_t iter_limit = std::clamp(bnb_lp_iter_per_node, lower_max_lp_iter, upper_max_lp_iter); + return trial_branching(original_lp, + settings, + var_types, + vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + branch_var, + branch_var_lower, + branch_var_upper, + upper_bound, + start_time, + iter_limit, + total_lp_iter, + work_ctx) + .first; +} + +template +f_t trial_branching_generic(const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + int64_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) +{ + omp_atomic_t atomic_iter{0}; + auto result = + trial_branching(original_lp, + settings, + var_types, + vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + branch_var, + branch_var_lower, + branch_var_upper, + upper_bound, + start_time, + std::clamp(bnb_lp_iter_per_node, lower_max_lp_iter, upper_max_lp_iter), + atomic_iter, + work_ctx); + total_lp_iter += atomic_iter.load(); + return result.first; +} + } // namespace template @@ -997,7 +1084,8 @@ void strong_branching(const lp_problem_t& original_lp, const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc) + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context) { constexpr bool verbose = false; @@ -1006,17 +1094,19 @@ void strong_branching(const lp_problem_t& original_lp, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; - const f_t elapsed_time = toc(start_time); - if (elapsed_time > settings.time_limit) { return; } + if (fractional.empty()) { return; } + if (toc(start_time) > settings.time_limit) { return; } + const bool deterministic_work_accounting = + work_unit_context != nullptr && work_unit_context->deterministic; + cuopt_assert(settings.deterministic == deterministic_work_accounting, + "inconsistent determinism state"); + const bool disable_batch_pdlp = settings.sub_mip || settings.deterministic; // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only const i_t effective_batch_pdlp = - (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1)) - ? 0 - : settings.mip_batch_pdlp_strong_branching; + disable_batch_pdlp ? 0 : settings.mip_batch_pdlp_strong_branching; - if (settings.mip_batch_pdlp_strong_branching != 0 && - (settings.sub_mip || settings.deterministic)) { + if (settings.mip_batch_pdlp_strong_branching != 0 && disable_batch_pdlp) { settings.log.printf( "Batch PDLP strong branching is disabled because sub-MIP or deterministic mode is enabled\n"); } @@ -1040,6 +1130,15 @@ void strong_branching(const lp_problem_t& original_lp, std::vector dual_simplex_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); f_t strong_branching_start_time = tic(); i_t simplex_iteration_limit = settings.strong_branching_simplex_iteration_limit; + const i_t n_tasks = std::min(4 * settings.num_threads, fractional.size()); + std::vector task_work_contexts; + if (deterministic_work_accounting) { + task_work_contexts.reserve(n_tasks); + for (i_t k = 0; k < n_tasks; ++k) { + task_work_contexts.emplace_back("sb_task_" + std::to_string(k)); + task_work_contexts.back().deterministic = true; + } + } if (simplex_iteration_limit < 1) { initialize_pseudo_costs_with_estimate(original_lp, @@ -1074,13 +1173,12 @@ void strong_branching(const lp_problem_t& original_lp, } if (effective_batch_pdlp != 2) { - i_t n = std::min(4 * settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) - for (i_t k = 0; k < n; k++) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); +#pragma omp taskloop num_tasks(n_tasks) + for (i_t k = 0; k < n_tasks; k++) { + i_t start = std::floor(k * fractional.size() / n_tasks); + i_t end = std::floor((k + 1) * fractional.size() / n_tasks); constexpr bool verbose = false; if (verbose) { @@ -1110,13 +1208,21 @@ void strong_branching(const lp_problem_t& original_lp, dual_simplex_obj_up, dual_simplex_status_down, dual_simplex_status_up, - sb_view); + sb_view, + deterministic_work_accounting ? &task_work_contexts[k] : nullptr); } // DS done: signal PDLP to stop (time-limit or all work done) and wait if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } } } } + if (deterministic_work_accounting) { + double max_work = 0.0; + for (const auto& ctx : task_work_contexts) { + max_work = std::max(max_work, ctx.current_work()); + } + work_unit_context->record_work_sync_on_horizon(max_work); + } } settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); @@ -1777,6 +1883,186 @@ i_t pseudo_costs_t::reliable_variable_selection( return branch_var; } +template +i_t reliable_variable_selection_core(mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const lp_problem_t& leaf_problem, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + SumT* sum_down, + SumT* sum_up, + CountT* num_down, + CountT* num_up, + i_t n_vars, + SBIterT& strong_branching_lp_iter, + f_t upper_bound, + int64_t bnb_lp_iters, + int64_t bnb_nodes_explored, + f_t start_time, + const reliability_branching_settings_t& rb_settings, + int num_tasks, + omp_mutex_t* var_mutex_down, + omp_mutex_t* var_mutex_up, + pcgenerator_t* rng, + cuopt::work_limit_context_t* work_ctx, + const sb_update_callback_t& on_sb_update) +{ + constexpr f_t eps = 1e-6; + i_t branch_var = fractional[0]; + f_t max_score = -1; + + auto avgs = compute_pseudo_cost_averages(sum_down, sum_up, num_down, num_up, (size_t)n_vars); + const f_t pseudo_cost_down_avg = avgs.down_avg; + const f_t pseudo_cost_up_avg = avgs.up_avg; + + const i_t bnb_lp_iter_per_node = + bnb_nodes_explored > 0 ? (i_t)(bnb_lp_iters / bnb_nodes_explored) : 0; + + i_t reliable_threshold = settings.reliability_branching; + if (reliable_threshold < 0) { + const int64_t alpha = (int64_t)(rb_settings.bnb_lp_factor * bnb_lp_iters); + const int64_t max_reliability_iter = alpha + rb_settings.bnb_lp_offset; + + f_t iter_fraction = + (max_reliability_iter - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0); + iter_fraction = std::min(1.0, iter_fraction); + iter_fraction = std::max((alpha - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0), + iter_fraction); + reliable_threshold = (int)((1 - iter_fraction) * rb_settings.min_reliable_threshold + + iter_fraction * rb_settings.max_reliable_threshold); + reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0; + } + + std::vector unreliable_list; + for (i_t j : fractional) { + if (num_down[j] < reliable_threshold || num_up[j] < reliable_threshold) { + unreliable_list.push_back(j); + continue; + } + const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg; + const f_t pc_up = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + const f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + if (unreliable_list.empty()) { + settings.log.debug( + "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); + return branch_var; + } + + const i_t max_num_candidates = rb_settings.max_num_candidates; + const int task_priority = rb_settings.task_priority; + const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); + + cuopt_assert(rng != nullptr, "rng must be provided for candidate shuffling"); + if (unreliable_list.size() > (size_t)max_num_candidates) { rng->shuffle(unreliable_list); } + if (toc(start_time) > settings.time_limit) { return branch_var; } + + omp_mutex_t score_mutex; + +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ + shared(score_mutex, strong_branching_lp_iter) + for (i_t i = 0; i < num_candidates; ++i) { + const i_t j = unreliable_list[i]; + if (toc(start_time) > settings.time_limit) { continue; } + + if (var_mutex_down) { var_mutex_down[j].lock(); } + if (num_down[j] < reliable_threshold) { + const f_t obj = trial_branching_generic(leaf_problem, + settings, + var_types, + node_ptr->vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + j, + leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + bnb_lp_iter_per_node, + start_time, + rb_settings.upper_max_lp_iter, + rb_settings.lower_max_lp_iter, + strong_branching_lp_iter, + work_ctx); + if (!std::isnan(obj)) { + const f_t delta = + std::max(obj - node_ptr->lower_bound, eps) / (solution[j] - std::floor(solution[j])); + sum_down[j] += delta; + num_down[j]++; + if (on_sb_update) { on_sb_update(j, rounding_direction_t::DOWN, delta); } + } + } + if (var_mutex_down) { var_mutex_down[j].unlock(); } + + if (toc(start_time) > settings.time_limit) { continue; } + + if (var_mutex_up) { var_mutex_up[j].lock(); } + if (num_up[j] < reliable_threshold) { + const f_t obj = trial_branching_generic(leaf_problem, + settings, + var_types, + node_ptr->vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + j, + std::ceil(solution[j]), + leaf_problem.upper[j], + upper_bound, + bnb_lp_iter_per_node, + start_time, + rb_settings.upper_max_lp_iter, + rb_settings.lower_max_lp_iter, + strong_branching_lp_iter, + work_ctx); + if (!std::isnan(obj)) { + const f_t delta = + std::max(obj - node_ptr->lower_bound, eps) / (std::ceil(solution[j]) - solution[j]); + sum_up[j] += delta; + num_up[j]++; + if (on_sb_update) { on_sb_update(j, rounding_direction_t::UP, delta); } + } + } + if (var_mutex_up) { var_mutex_up[j].unlock(); } + + if (toc(start_time) > settings.time_limit) { continue; } + + const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg; + const f_t pc_up = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + const f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + + score_mutex.lock(); + if (score > max_score) { + max_score = score; + branch_var = j; + } + score_mutex.unlock(); + } + + settings.log.debug("Reliability branching result: node=%d branch_var=%d value=%e score=%e\n", + node_ptr->node_id, + branch_var, + solution[branch_var], + max_score); + return branch_var; +} + template f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, const std::vector& solution, @@ -1814,20 +2100,15 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( { for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; - for (i_t branch = 0; branch < 2; branch++) { - if (branch == 0) { - f_t change_in_obj = strong_branch_down[k]; - if (std::isnan(change_in_obj)) { continue; } - f_t frac = root_soln[j] - std::floor(root_soln[j]); - pseudo_cost_sum_down[j] += change_in_obj / frac; - pseudo_cost_num_down[j]++; - } else { - f_t change_in_obj = strong_branch_up[k]; - if (std::isnan(change_in_obj)) { continue; } - f_t frac = std::ceil(root_soln[j]) - root_soln[j]; - pseudo_cost_sum_up[j] += change_in_obj / frac; - pseudo_cost_num_up[j]++; - } + if (!std::isnan(strong_branch_down[k])) { + const f_t frac = root_soln[j] - std::floor(root_soln[j]); + pseudo_cost_sum_down[j] += strong_branch_down[k] / frac; + pseudo_cost_num_down[j]++; + } + if (!std::isnan(strong_branch_up[k])) { + const f_t frac = std::ceil(root_soln[j]) - root_soln[j]; + pseudo_cost_sum_up[j] += strong_branch_up[k] / frac; + pseudo_cost_num_up[j]++; } } } @@ -1836,6 +2117,68 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( template class pseudo_costs_t; +template int reliable_variable_selection_core, + omp_atomic_t, + omp_atomic_t>( + mip_node_t*, + const std::vector&, + const std::vector&, + const simplex_solver_settings_t&, + const std::vector&, + const lp_problem_t&, + const std::vector&, + const basis_update_mpf_t&, + const std::vector&, + const std::vector&, + omp_atomic_t*, + omp_atomic_t*, + omp_atomic_t*, + omp_atomic_t*, + int, + omp_atomic_t&, + double, + int64_t, + int64_t, + double, + const reliability_branching_settings_t&, + int, + omp_mutex_t*, + omp_mutex_t*, + pcgenerator_t*, + cuopt::work_limit_context_t*, + const sb_update_callback_t&); + +template int reliable_variable_selection_core( + mip_node_t*, + const std::vector&, + const std::vector&, + const simplex_solver_settings_t&, + const std::vector&, + const lp_problem_t&, + const std::vector&, + const basis_update_mpf_t&, + const std::vector&, + const std::vector&, + double*, + double*, + int*, + int*, + int, + int64_t&, + double, + int64_t, + int64_t, + double, + const reliability_branching_settings_t&, + int, + omp_mutex_t*, + omp_mutex_t*, + pcgenerator_t*, + cuopt::work_limit_context_t*, + const sb_update_callback_t&); + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, double start_time, @@ -1850,7 +2193,8 @@ template void strong_branching(const lp_problem_t& ori const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc); + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context); #endif diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 009bd8b81a..6393a8cd41 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -17,12 +17,14 @@ #include #include +#include #include #include #include #include +#include #include namespace cuopt::linear_programming::dual_simplex { @@ -357,6 +359,13 @@ class pseudo_cost_snapshot_t { } } + // Record an update that was already applied to the arrays (e.g. by strong branching). + void record_update( + i_t variable, rounding_direction_t direction, f_t delta, double clock, int worker_id) + { + updates_.push_back({variable, direction, delta, clock, worker_id}); + } + std::vector> take_updates() { std::vector> result; @@ -370,6 +379,7 @@ class pseudo_cost_snapshot_t { std::vector sum_up_; std::vector num_down_; std::vector num_up_; + int64_t strong_branching_lp_iter_{0}; private: std::vector> updates_; @@ -452,8 +462,10 @@ class pseudo_costs_t { nd[j] = pseudo_cost_num_down[j]; nu[j] = pseudo_cost_num_up[j]; } - return pseudo_cost_snapshot_t( - std::move(sd), std::move(su), std::move(nd), std::move(nu)); + auto snap = + pseudo_cost_snapshot_t(std::move(sd), std::move(su), std::move(nd), std::move(nu)); + snap.strong_branching_lp_iter_ = strong_branching_lp_iter.load(); + return snap; } void merge_updates(const std::vector>& updates) @@ -541,6 +553,44 @@ class pseudo_costs_t { batch_pdlp_warm_cache_t pdlp_warm_cache; }; +// Callback invoked after each strong-branching pseudocost discovery. +template +using sb_update_callback_t = + std::function; + +// Core reliability branching loop usable by both opportunistic and deterministic paths. +// When num_tasks == 1, runs serially with no locking (deterministic). +// When num_tasks > 1 with mutexes/rng, uses OMP taskloop (opportunistic). +// SumT/CountT can be f_t/i_t (deterministic snapshot) or omp_atomic_t/omp_atomic_t. +template +i_t reliable_variable_selection_core(mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const lp_problem_t& leaf_problem, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + SumT* sum_down, + SumT* sum_up, + CountT* num_down, + CountT* num_up, + i_t n_vars, + SBIterT& strong_branching_lp_iter, + f_t upper_bound, + int64_t bnb_lp_iters, + int64_t bnb_nodes_explored, + f_t start_time, + const reliability_branching_settings_t& rb_settings, + int num_tasks, + omp_mutex_t* var_mutex_down, + omp_mutex_t* var_mutex_up, + pcgenerator_t* rng, + cuopt::work_limit_context_t* work_ctx = nullptr, + const sb_update_callback_t& on_sb_update = {}); + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -556,6 +606,7 @@ void strong_branching(const lp_problem_t& original_lp, const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc); + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context = nullptr); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 9c56ada50e..69ac7e43df 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -2202,7 +2202,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect. std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less()); - work_estimate_ += (m + nz) * std::log2(m + nz); + if (nz > 1) { work_estimate_ += (nz)*std::log2((f_t)(nz)); } // Gather the workspace into a column of S i_t S_start; @@ -2214,7 +2214,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde // Gather etilde into a column of S etilde.sort(); // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input? - work_estimate_ += etilde.i.size() * std::log2(etilde.i.size()); + if (etilde.i.size() > 1) { work_estimate_ += etilde.i.size() * std::log2((f_t)etilde.i.size()); } S_.append_column(etilde); work_estimate_ += 4 * etilde.i.size(); diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp index e30b067398..d9abc26fe1 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp @@ -235,7 +235,7 @@ void bound_flipping_ratio_test_t::heap_passes(const std::vector& // Remove minimum ratio from the heap and rebalance i_t heap_index = bare_idx.front(); std::pop_heap(bare_idx.begin(), bare_idx.end(), compare); - work_estimate_ += 2 * std::log2(bare_idx.size()); + if (bare_idx.size() > 1) { work_estimate_ += 2 * std::log2((f_t)bare_idx.size()); } bare_idx.pop_back(); nonbasic_entering = current_indicies[heap_index]; diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp index 244ff334df..4b62c66771 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp @@ -100,7 +100,7 @@ class bound_flipping_ratio_test_t { i_t n_; i_t m_; - f_t work_estimate_; + f_t work_estimate_{0.0}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 5b1130796e..0e841fe22f 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -3551,7 +3551,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2_work_estimate += ft.work_estimate(); ft.clear_work_estimate(); - work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8); + if (work_unit_context) { + work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8); + } phase2_work_estimate = 0.0; last_feature_log_iter = iter; diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..9aea2f1648 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -113,7 +114,7 @@ struct simplex_solver_settings_t { reliability_branching(-1), inside_mip(0), sub_mip(0), - solution_callback(nullptr), + new_incumbent_callback(nullptr), heuristic_preemption_callback(nullptr), dual_simplex_objective_callback(nullptr), concurrent_halt(nullptr) @@ -202,6 +203,8 @@ struct simplex_solver_settings_t { // 0, 1 - Estimate the objective change using a single pivot of dual simplex // >1 - Set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; + f_t bb_work_unit_scale{1.0}; + bool gpu_heur_wait_for_exploration{true}; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics @@ -214,7 +217,9 @@ struct simplex_solver_settings_t { i_t inside_mip; // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node i_t sub_mip; // 0 if in regular MIP solve, 1 if in sub-MIP solve - std::function&, f_t)> solution_callback; + std::function&, f_t, const cuopt::internals::mip_solution_callback_info_t&, double)> + new_incumbent_callback; std::function&, f_t)> node_processed_callback; std::function heuristic_preemption_callback; std::function&, std::vector&, f_t)> set_simplex_solution_callback; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index c23b1d27ca..10026eb05e 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -113,6 +113,9 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_HYPER_HEURISTIC_INITIAL_INFEASIBILITY_WEIGHT, &mip_settings.heuristic_params.initial_infeasibility_weight, f_t(1e-9), std::numeric_limits::infinity(), f_t(1000.0), "constraint violation penalty seed"}, {CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT, &mip_settings.heuristic_params.relaxed_lp_time_limit, f_t(1e-9), std::numeric_limits::infinity(), f_t(1.0), "base relaxed LP time cap in heuristics"}, {CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT, &mip_settings.heuristic_params.related_vars_time_limit, f_t(1e-9), std::numeric_limits::infinity(), f_t(30.0), "time for related-variable structure build"}, + {CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE, &mip_settings.cpufj_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on CPUFJ work-unit rate"}, + {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE, &mip_settings.gpu_heur_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on GPU heuristics work-unit rate"}, + {CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE, &mip_settings.bb_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on B&B work-unit rate"}, }; // Int parameters @@ -142,7 +145,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits::max(), -1}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, - {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, + {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_DETERMINISM_NONE, CUOPT_DETERMINISM_FULL, CUOPT_DETERMINISM_NONE}, {CUOPT_RANDOM_SEED, &mip_settings.seed, -1, std::numeric_limits::max(), -1}, {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits::max(), -1}, {CUOPT_PDLP_PRECISION, reinterpret_cast(&pdlp_settings.pdlp_precision), CUOPT_PDLP_DEFAULT_PRECISION, CUOPT_PDLP_MIXED_PRECISION, CUOPT_PDLP_DEFAULT_PRECISION}, @@ -171,6 +174,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_ELIMINATE_DENSE_COLUMNS, &pdlp_settings.eliminate_dense_columns, true}, {CUOPT_CUDSS_DETERMINISTIC, &pdlp_settings.cudss_deterministic, false}, {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, + {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION, &mip_settings.gpu_heur_wait_for_exploration, false, "GPU heuristics wait for B&B root solve before starting"}, }; // String parameters string_parameters = { diff --git a/cpp/src/mip_heuristics/diversity/diversity_config.hpp b/cpp/src/mip_heuristics/diversity/diversity_config.hpp index dacf7773de..c27f857ba0 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_config.hpp +++ b/cpp/src/mip_heuristics/diversity/diversity_config.hpp @@ -26,6 +26,10 @@ struct diversity_config_t { double lp_run_time_if_feasible = 2.; double lp_run_time_if_infeasible = 1.; bool halve_population = false; + bool fj_only_run = false; + bool dry_run = false; + bool initial_solution_only = false; + int n_fp_iterations = 1000000; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index b8dc3d33bf..0065acef76 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -5,7 +5,6 @@ */ /* clang-format on */ -#include "cuda_profiler_api.h" #include "diversity_manager.cuh" #include @@ -14,12 +13,21 @@ #include #include #include +#include #include +#include #include -#include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif constexpr bool fj_only_run = false; @@ -55,7 +63,7 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_thandle_ptr->get_stream()), ls(context, lp_optimal_solution), rins(context, *this), - timer(diversity_config.default_time_limit), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}), bound_prop_recombiner(context, context.problem_ptr->n_variables, ls.constraint_prop, @@ -79,6 +87,31 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_t::n_of_arms, cuopt::seed_generator::get_seed(), ls_alpha, "ls"), ls_hash_map(*context.problem_ptr) { + // Necessary for tests that run sequentially - static globals aren't otherwise reset + fp_recombiner_config_t::max_n_of_vars_from_other = + fp_recombiner_config_t::initial_n_of_vars_from_other; + ls_recombiner_config_t::max_n_of_vars_from_other = + ls_recombiner_config_t::initial_n_of_vars_from_other; + bp_recombiner_config_t::max_n_of_vars_from_other = + bp_recombiner_config_t::initial_n_of_vars_from_other; + sub_mip_recombiner_config_t::max_n_of_vars_from_other = + sub_mip_recombiner_config_t::initial_n_of_vars_from_other; + mab_ls_config_t::last_lm_config = 0; + mab_ls_config_t::last_ls_mab_option = 0; + + CUOPT_DETERMINISM_LOG( + "Deterministic solve start diversity state: seed_state=%lld fp_max=%zu " + "ls_max=%zu bp_max=%zu sub_mip_max=%zu last_lm=%d last_ls=%d " + "enabled_recombiners=%zu", + (long long)cuopt::seed_generator::peek_seed(), + fp_recombiner_config_t::max_n_of_vars_from_other, + ls_recombiner_config_t::max_n_of_vars_from_other, + bp_recombiner_config_t::max_n_of_vars_from_other, + sub_mip_recombiner_config_t::max_n_of_vars_from_other, + (int)mab_ls_config_t::last_lm_config, + (int)mab_ls_config_t::last_ls_mab_option, + recombiner_t::enabled_recombiners.size()); + int max_config = -1; int env_config_id = -1; const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG"); @@ -106,6 +139,9 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_t @@ -153,7 +189,7 @@ void diversity_manager_t::consume_staged_simplex_solution(lp_state_t bool diversity_manager_t::run_local_search(solution_t& solution, const weight_t& weights, - timer_t& timer, + termination_checker_t& timer, ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_local_search"); @@ -174,7 +210,7 @@ void diversity_manager_t::generate_solution(f_t time_limit, bool rando sol.compute_feasibility(); // if a feasible is found, it is added to the population ls.generate_solution(sol, random_start, &population, time_limit); - population.add_solution(std::move(sol)); + population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } template @@ -187,7 +223,12 @@ void diversity_manager_t::add_user_given_solutions( rmm::device_uvector init_sol_assignment(*init_sol, sol.handle_ptr->get_stream()); if (problem_ptr->pre_process_assignment(init_sol_assignment)) { relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = std::min(60., timer.remaining_time() / 2); + lp_settings.time_limit = std::min(60., timer.remaining_time() / 2); + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = problem_ptr->tolerances.absolute_tolerance; lp_settings.save_state = false; lp_settings.return_first_feasible = true; @@ -206,7 +247,9 @@ void diversity_manager_t::add_user_given_solutions( is_feasible, sol.get_user_objective(), sol.get_total_excess()); - population.run_solution_callbacks(sol); + if (is_feasible) { + population.run_solution_callbacks(sol, internals::mip_solution_origin_t::USER_INITIAL); + } initial_sol_vector.emplace_back(std::move(sol)); } else { CUOPT_LOG_ERROR( @@ -220,11 +263,13 @@ void diversity_manager_t::add_user_given_solutions( } template -bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_timer) +bool diversity_manager_t::run_presolve(f_t time_limit, + cuopt::termination_checker_t& global_timer) { raft::common::nvtx::range fun_scope("run_presolve"); CUOPT_LOG_INFO("Running presolve!"); - timer_t presolve_timer(time_limit); + CUOPT_LOG_INFO("Problem fingerprint before DM presolve: 0x%x", problem_ptr->get_fingerprint()); + termination_checker_t presolve_timer(context.gpu_heur_loop, time_limit, *context.termination); auto term_crit = ls.constraint_prop.bounds_update.solve(*problem_ptr); if (ls.constraint_prop.bounds_update.infeas_constraints_count > 0) { @@ -234,15 +279,17 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ if (termination_criterion_t::NO_UPDATE != term_crit) { ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr); } + bool run_probing_cache = !fj_only_run; - // Don't run probing cache in deterministic mode yet as neither B&B nor CPUFJ need it - // and it doesn't make use of work units yet - if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; } if (run_probing_cache) { // Run probing cache before trivial presolve to discover variable implications - const f_t max_time_on_probing = diversity_config.max_time_on_probing; - f_t time_for_probing_cache = std::min(max_time_on_probing, time_limit); - timer_t probing_timer{time_for_probing_cache}; + const f_t max_time_on_probing = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) + ? std::numeric_limits::infinity() + : diversity_config.max_time_on_probing; + f_t time_for_probing_cache = std::min(max_time_on_probing, time_limit); + termination_checker_t probing_timer( + context.gpu_heur_loop, time_for_probing_cache, *context.termination); // this function computes probing cache, finds singletons, substitutions and changes the problem bool problem_is_infeasible = compute_probing_cache(ls.constraint_prop.bounds_update, *problem_ptr, probing_timer); @@ -252,8 +299,10 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ problem_ptr->related_vars_time_limit = context.settings.heuristic_params.related_vars_time_limit; if (!global_timer.check_time_limit()) { trivial_presolve(*problem_ptr, remap_cache_ids); } if (!problem_ptr->empty && !check_bounds_sanity(*problem_ptr)) { return false; } - // if (!presolve_timer.check_time_limit() && !context.settings.heuristics_only && - // !problem_ptr->empty) { + const bool run_clique_table = + !presolve_timer.check_time_limit() && !context.settings.heuristics_only && + !problem_ptr->empty && !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + // if (run_clique_table) { // f_t time_limit_for_clique_table = std::min(3., presolve_timer.remaining_time() / 5); // timer_t clique_timer(time_limit_for_clique_table); // dual_simplex::user_problem_t host_problem(problem_ptr->handle_ptr); @@ -292,6 +341,10 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ } stats.presolve_time = presolve_timer.elapsed_time(); lp_optimal_solution.resize(problem_ptr->n_variables, problem_ptr->handle_ptr->get_stream()); + thrust::fill(problem_ptr->handle_ptr->get_thrust_policy(), + lp_optimal_solution.begin(), + lp_optimal_solution.end(), + f_t(0)); lp_dual_optimal_solution.resize(problem_ptr->n_constraints, problem_ptr->handle_ptr->get_stream()); problem_ptr->handle_ptr->sync_stream(); @@ -299,7 +352,9 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ problem_ptr->n_constraints, problem_ptr->n_variables, problem_ptr->presolve_data.objective_offset); - CUOPT_LOG_INFO("cuOpt presolve time: %.2f", stats.presolve_time); + CUOPT_LOG_INFO("cuOpt presolve time: %.2f, fingerprint: 0x%x", + stats.presolve_time, + problem_ptr->get_fingerprint()); return true; } @@ -311,24 +366,25 @@ void diversity_manager_t::generate_quick_feasible_solution() // min 1 second, max 10 seconds const f_t generate_fast_solution_time = std::min(diversity_config.max_fast_sol_time, std::max(1., timer.remaining_time() / 20.)); - timer_t sol_timer(generate_fast_solution_time); + termination_checker_t sol_timer( + context.gpu_heur_loop, generate_fast_solution_time, *context.termination); // do very short LP run to get somewhere close to the optimal point ls.generate_fast_solution(solution, sol_timer); if (solution.get_feasible()) { - population.run_solution_callbacks(solution); initial_sol_vector.emplace_back(std::move(solution)); problem_ptr->handle_ptr->sync_stream(); solution_t searched_sol(initial_sol_vector.back()); ls_config_t ls_config; run_local_search(searched_sol, population.weights, sol_timer, ls_config); - population.run_solution_callbacks(searched_sol); initial_sol_vector.emplace_back(std::move(searched_sol)); auto& feas_sol = initial_sol_vector.back().get_feasible() ? initial_sol_vector.back() : initial_sol_vector[initial_sol_vector.size() - 2]; - CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f", + population.run_solution_callbacks(feas_sol, internals::mip_solution_origin_t::LOCAL_SEARCH); + CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f, hash 0x%x", timer.elapsed_time(), - feas_sol.get_user_objective()); + feas_sol.get_user_objective(), + feas_sol.get_hash()); } problem_ptr->handle_ptr->sync_stream(); } @@ -366,8 +422,29 @@ void diversity_manager_t::run_fp_alone() { CUOPT_LOG_DEBUG("Running FP alone!"); solution_t sol(population.best_feasible()); - ls.run_fp(sol, timer, &population); - CUOPT_LOG_DEBUG("FP alone finished!"); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone input: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + ls.run_fp(sol, timer, &population, diversity_config.n_fp_iterations); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone output: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + if (sol.get_feasible()) { + population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); + } + auto& best_sol = population.best_feasible(); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone population best after: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + best_sol.get_hash(), + (int)best_sol.get_feasible(), + best_sol.get_user_objective(), + best_sol.get_total_excess()); } template @@ -384,19 +461,46 @@ solution_t diversity_manager_t::run_solver() raft::common::nvtx::range fun_scope("run_solver"); CUOPT_LOG_DEBUG("Determinism mode: %s", - context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC ? "deterministic" - : "opportunistic"); + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) + ? "deterministic" + : "opportunistic"); // to automatically compute the solving time on scope exit auto timer_raii_guard = cuopt::scope_guard([&]() { stats.total_solve_time = timer.elapsed_time(); }); + auto log_return_solution = [&](const char* reason, solution_t& sol) { + CUOPT_DETERMINISM_LOG( + "Deterministic run_solver return: reason=%s hash=0x%x feasible=%d " + "obj=%.16e excess=%.16e", + reason, + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + }; + + const bool deterministic_bb_without_deterministic_heuristics = + (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); // Debug: Allow disabling GPU heuristics to test B&B tree determinism in isolation const char* disable_heuristics_env = std::getenv("CUOPT_DISABLE_GPU_HEURISTICS"); - if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { - CUOPT_LOG_INFO("Running deterministic mode with CPUFJ heuristic"); + if (deterministic_bb_without_deterministic_heuristics || + (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1")) { + CUOPT_LOG_INFO("GPU heuristics disabled (det_bb_only=%d env=%s)", + (int)deterministic_bb_without_deterministic_heuristics, + disable_heuristics_env ? disable_heuristics_env : "unset"); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr) { + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + producer_sync.registration_complete(); + } population.initialize_population(); population.allocate_solutions(); + std::vector> initial_sol_vector; + add_user_given_solutions(initial_sol_vector); + population.add_solutions_from_vec(std::move(initial_sol_vector), + internals::mip_solution_origin_t::USER_INITIAL); // Start CPUFJ in deterministic mode with B&B integration if (context.branch_and_bound_ptr != nullptr) { @@ -412,21 +516,38 @@ solution_t diversity_manager_t::run_solver() ls.stop_cpufj_deterministic(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("heuristics_disabled", best_sol); + return best_sol; } - if (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1") { - CUOPT_LOG_INFO("GPU heuristics disabled via CUOPT_DISABLE_GPU_HEURISTICS=1"); - population.initialize_population(); - population.allocate_solutions(); - while (!check_b_b_preemption()) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool gpu_heuristic_producer_registered = false; + auto gpu_heuristic_producer_guard = cuopt::scope_guard([&]() { + if (!gpu_heuristic_producer_registered || context.branch_and_bound_ptr == nullptr) { return; } + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + producer_sync.deregister_producer(context.gpu_heur_loop.producer_progress_ptr()); + context.gpu_heur_loop.detach_producer_sync(); + }); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr) { + if (context.settings.gpu_heur_wait_for_exploration) { + CUOPT_LOG_INFO("GPU heuristics waiting for B&B tree exploration to start..."); + auto wait_start = std::chrono::high_resolution_clock::now(); + context.branch_and_bound_ptr->wait_for_exploration_start(); + double wait_elapsed = + std::chrono::duration(std::chrono::high_resolution_clock::now() - wait_start) + .count(); + CUOPT_LOG_INFO("GPU heuristics resumed after %.2fs (B&B exploration started)", wait_elapsed); } - return population.best_feasible(); + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + context.gpu_heur_loop.attach_producer_sync(&producer_sync); + producer_sync.register_producer(context.gpu_heur_loop.producer_progress_ptr()); + producer_sync.registration_complete(); + gpu_heuristic_producer_registered = true; } population.timer = timer; - const f_t time_limit = timer.remaining_time(); + const f_t time_limit = timer.deterministic ? timer.get_time_limit() : timer.remaining_time(); const auto& hp = context.settings.heuristic_params; const f_t lp_time_limit = std::min(hp.root_lp_max_time, time_limit * hp.root_lp_time_ratio); // after every change to the problem, we should resize all the relevant vars @@ -438,7 +559,7 @@ solution_t diversity_manager_t::run_solver() // have the structure ready for reusing later problem_ptr->compute_integer_fixed_problem(); recombiner_t::init_enabled_recombiners( - *problem_ptr, context.settings.heuristic_params.enabled_recombiners); + context, *problem_ptr, context.settings.heuristic_params.enabled_recombiners); mab_recombiner.resize_mab_arm_stats(recombiner_t::enabled_recombiners.size()); // test problem is not ii cuopt_func_call( @@ -448,13 +569,27 @@ solution_t diversity_manager_t::run_solver() "The problem must not be ii"); population.initialize_population(); population.allocate_solutions(); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_after_population_init", best_sol); + return best_sol; + } add_user_given_solutions(initial_sol_vector); + CUOPT_DETERMINISM_LOG("DM bootstrap: initial_sol_vector size after user solutions = %lu", + initial_sol_vector.size()); // Run CPUFJ early to find quick initial solutions ls_cpufj_raii_guard_t ls_cpufj_raii_guard(ls); // RAII to stop cpufj threads on solve stop - ls.start_cpufj_scratch_threads(population); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (!diversity_config.dry_run && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + ls.start_cpufj_scratch_threads(population); + } + + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_before_lp", best_sol); + return best_sol; + } lp_state_t& lp_state = problem_ptr->lp_state; // resize because some constructor might be called before the presolve lp_state.resize(*problem_ptr, problem_ptr->handle_ptr->get_stream()); @@ -462,30 +597,62 @@ solution_t diversity_manager_t::run_solver() if (bb_thread_solution_exists) { consume_staged_simplex_solution(lp_state); ls.lp_optimal_exists = true; - } else if (!fj_only_run) { + } else if (!diversity_config.fj_only_run) { convert_greater_to_less(*problem_ptr); f_t absolute_tolerance = context.settings.tolerances.absolute_tolerance; - pdlp_solver_settings_t pdlp_settings{}; - pdlp_settings.tolerances.absolute_dual_tolerance = absolute_tolerance; - pdlp_settings.tolerances.relative_dual_tolerance = - context.settings.tolerances.relative_tolerance; - pdlp_settings.tolerances.absolute_primal_tolerance = absolute_tolerance; - pdlp_settings.tolerances.relative_primal_tolerance = - context.settings.tolerances.relative_tolerance; - pdlp_settings.time_limit = lp_time_limit; - pdlp_settings.first_primal_feasible = false; - pdlp_settings.concurrent_halt = &global_concurrent_halt; - pdlp_settings.method = method_t::Concurrent; - pdlp_settings.inside_mip = true; - pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; - pdlp_settings.num_gpus = context.settings.num_gpus; - pdlp_settings.presolver = presolver_t::None; - pdlp_settings.per_constraint_residual = true; - set_pdlp_solver_mode(pdlp_settings); - timer_t lp_timer(lp_time_limit); - auto lp_result = solve_lp_with_method(*problem_ptr, pdlp_settings, lp_timer); + auto lp_result = [&]() { + // no concurrent root solve in determinism mode, reuse the work-accounted relaxed_lp machinery + // for this + if (timer.deterministic) { + relaxed_lp_settings_t lp_settings{}; + lp_settings.time_limit = lp_time_limit; + lp_settings.work_limit = lp_time_limit; + lp_settings.tolerance = absolute_tolerance; + lp_settings.check_infeasibility = true; + lp_settings.return_first_feasible = false; + lp_settings.save_state = true; + lp_settings.per_constraint_residual = true; + lp_settings.has_initial_primal = false; + lp_settings.concurrent_halt = &global_concurrent_halt; + lp_settings.work_context = &context.gpu_heur_loop; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + CUOPT_DETERMINISM_LOG( + "DM root LP config: dry_run=%d deterministic=%d work_limit=%.6f time_limit=%.6f", + (int)diversity_config.dry_run, + (int)timer.deterministic, + lp_settings.work_limit, + lp_settings.time_limit); + return get_relaxed_lp_solution( + *problem_ptr, lp_optimal_solution, lp_state, lp_settings); + } + pdlp_solver_settings_t pdlp_settings{}; + pdlp_settings.tolerances.absolute_dual_tolerance = absolute_tolerance; + pdlp_settings.tolerances.relative_dual_tolerance = + context.settings.tolerances.relative_tolerance; + pdlp_settings.tolerances.absolute_primal_tolerance = absolute_tolerance; + pdlp_settings.tolerances.relative_primal_tolerance = + context.settings.tolerances.relative_tolerance; + pdlp_settings.time_limit = lp_time_limit; + pdlp_settings.first_primal_feasible = false; + pdlp_settings.concurrent_halt = &global_concurrent_halt; + pdlp_settings.method = method_t::Concurrent; + pdlp_settings.inside_mip = true; + pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; + pdlp_settings.num_gpus = context.settings.num_gpus; + pdlp_settings.presolver = presolver_t::None; + pdlp_settings.per_constraint_residual = true; + set_pdlp_solver_mode(pdlp_settings); + timer_t lp_timer(lp_time_limit); + return solve_lp_with_method(*problem_ptr, pdlp_settings, lp_timer); + }(); + CUOPT_DETERMINISM_LOG( + "DM root LP result: status=%d iters=%d user_obj=%.12f primal_hash=0x%x", + (int)lp_result.get_termination_status(), + lp_result.get_additional_termination_information().number_of_steps_taken, + lp_result.get_objective_value(), + detail::compute_hash(lp_result.get_primal_solution(), problem_ptr->handle_ptr->get_stream())); bool use_staged_simplex_solution = false; { @@ -527,9 +694,11 @@ solution_t diversity_manager_t::run_solver() } else if (lp_result.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { CUOPT_LOG_ERROR("PDLP detected dual infeasibility, continuing anyway!"); ls.lp_optimal_exists = false; - } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit) { + } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit || + lp_result.get_termination_status() == pdlp_termination_status_t::IterationLimit) { CUOPT_LOG_DEBUG( - "Initial LP run exceeded time limit, continuing solver with partial LP result!"); + "Initial LP run exceeded time/iteration limit, continuing solver with partial LP " + "result!"); // note to developer, in debug mode the LP run might be too slow and it might cause PDLP // not to bring variables within the bounds } @@ -573,50 +742,106 @@ solution_t diversity_manager_t::run_solver() if (!use_staged_simplex_solution) { // in case the pdlp returned var boudns that are out of bounds clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr); + CUOPT_DETERMINISM_LOG( + "DM root LP post-clamp: lp_optimal_solution hash=0x%x", + detail::compute_hash(lp_optimal_solution, problem_ptr->handle_ptr->get_stream())); } } if (ls.lp_optimal_exists) { solution_t lp_rounded_sol(*problem_ptr); lp_rounded_sol.copy_new_assignment(lp_optimal_solution); + CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP raw): hash=0x%x feas=%d obj=%.12f", + lp_rounded_sol.get_hash(), + (int)lp_rounded_sol.get_feasible(), + lp_rounded_sol.get_user_objective()); lp_rounded_sol.round_nearest(); lp_rounded_sol.compute_feasibility(); - population.add_solution(std::move(lp_rounded_sol)); - ls.start_cpufj_lptopt_scratch_threads(population); + CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP rounded): hash=0x%x feas=%d obj=%.12f", + lp_rounded_sol.get_hash(), + (int)lp_rounded_sol.get_feasible(), + lp_rounded_sol.get_user_objective()); + population.add_solution(std::move(lp_rounded_sol), + internals::mip_solution_origin_t::LP_ROUNDING); + if (!diversity_config.dry_run && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + ls.start_cpufj_lptopt_scratch_threads(population); + } } - population.add_solutions_from_vec(std::move(initial_sol_vector)); + for (size_t i = 0; i < initial_sol_vector.size(); ++i) { + CUOPT_DETERMINISM_LOG( + "DM bootstrap candidate (initial_sol_vector[%lu]): hash=0x%x feas=%d obj=%.12f", + i, + initial_sol_vector[i].get_hash(), + (int)initial_sol_vector[i].get_feasible(), + initial_sol_vector[i].get_user_objective()); + } + population.add_solutions_from_vec(std::move(initial_sol_vector), + internals::mip_solution_origin_t::USER_INITIAL); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_after_initial_population", best_sol); + return best_sol; + } if (context.settings.benchmark_info_ptr != nullptr) { context.settings.benchmark_info_ptr->objective_of_initial_population = population.best_feasible().get_user_objective(); } - if (fj_only_run) { + if (diversity_config.dry_run) { + auto& best_sol = population.best_feasible(); + log_return_solution("dry_run", best_sol); + return best_sol; + } + if (diversity_config.fj_only_run) { solution_t sol(*problem_ptr); run_fj_alone(sol); + log_return_solution("fj_only_run", sol); return sol; } - rins.enable(); + // RINS not supported in deterministic mode yet + if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { rins.enable(); } generate_solution(timer.remaining_time(), false); + if (diversity_config.initial_solution_only) { + auto& best_sol = population.best_feasible(); + log_return_solution("initial_solution_only", best_sol); + return best_sol; + } if (timer.check_time_limit()) { rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("work_limit_reached", best_sol); + return best_sol; } if (check_b_b_preemption()) { rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_before_fp", best_sol); + return best_sol; } + CUOPT_LOG_DEBUG("pre-run_fp_alone: gpu_work=%g gpu_prod=%g", + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work()); run_fp_alone(); + CUOPT_LOG_DEBUG("post-run_fp_alone: gpu_work=%g gpu_prod=%g", + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work()); rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + CUOPT_LOG_DEBUG("post-fp handoff: feas=%d obj=%g hash=0x%x", + (int)best_sol.get_feasible(), + best_sol.get_user_objective(), + best_sol.get_hash()); + log_return_solution("post_fp_alone", best_sol); + return best_sol; }; template @@ -641,8 +866,10 @@ void diversity_manager_t::diversity_step(i_t max_iterations_without_im auto [sol1, sol2] = population.get_two_random(tournament); cuopt_assert(population.test_invariant(), ""); auto [lp_offspring, offspring] = recombine_and_local_search(sol1, sol2); - auto [inserted_pos_1, best_updated_1] = population.add_solution(std::move(lp_offspring)); - auto [inserted_pos_2, best_updated_2] = population.add_solution(std::move(offspring)); + auto [inserted_pos_1, best_updated_1] = population.add_solution( + std::move(lp_offspring), internals::mip_solution_origin_t::RECOMBINATION); + auto [inserted_pos_2, best_updated_2] = population.add_solution( + std::move(offspring), internals::mip_solution_origin_t::RECOMBINATION); if (best_updated_1 || best_updated_2) { recombine_stats.add_best_updated(); } cuopt_assert(population.test_invariant(), ""); if ((inserted_pos_1 != -1 && inserted_pos_1 <= 2) || @@ -684,10 +911,12 @@ void diversity_manager_t::recombine_and_ls_with_all(solution_t::recombine_and_ls_with_all(solution_t void diversity_manager_t::recombine_and_ls_with_all( - std::vector>& solutions, bool add_only_feasible) + std::vector::drained_external_solution_t>& solutions, + bool add_only_feasible) { raft::common::nvtx::range fun_scope("recombine_and_ls_with_all"); if (solutions.size() > 0) { CUOPT_LOG_DEBUG("Running recombiners on B&B solutions with size %lu", solutions.size()); // add all solutions because time limit might have been consumed and we might have exited before - for (auto& sol : solutions) { + for (auto& drained_sol : solutions) { + auto& sol = drained_sol.solution; cuopt_func_call(sol.test_feasibility(true)); - population.add_solution(std::move(solution_t(sol))); + population.add_solution(std::move(solution_t(sol)), drained_sol.origin); } - for (auto& sol : solutions) { + for (auto& drained_sol : solutions) { + auto& sol = drained_sol.solution; if (timer.check_time_limit()) { return; } solution_t ls_solution(sol); ls_config_t ls_config; @@ -759,6 +991,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& sol1.get_feasible(), sol2.get_quality(population.weights), sol2.get_feasible()); + bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); double best_objective_of_parents = std::min(sol1.get_objective(), sol2.get_objective()); bool at_least_one_parent_feasible = sol1.get_feasible() || sol2.get_feasible(); // randomly choose among 3 recombiners @@ -769,7 +1002,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& std::numeric_limits::lowest(), std::numeric_limits::lowest(), std::numeric_limits::max(), - recombiner_work_normalized_reward_t(0.0)); + recombiner_work_normalized_reward_t(deterministic, 0.0)); return std::make_pair(solution_t(sol1), solution_t(sol2)); } cuopt_assert(population.test_invariant(), ""); @@ -789,7 +1022,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& std::numeric_limits::lowest(), std::numeric_limits::lowest(), std::numeric_limits::max(), - recombiner_work_normalized_reward_t(0.0)); + recombiner_work_normalized_reward_t(deterministic, 0.0)); return std::make_pair(solution_t(sol1), solution_t(sol2)); } cuopt_assert(offspring.test_number_all_integer(), "All must be integers after LS"); @@ -807,7 +1040,12 @@ diversity_manager_t::recombine_and_local_search(solution_t& : diversity_config.lp_run_time_if_infeasible; lp_run_time = std::min(lp_run_time, timer.remaining_time()); relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_run_time; + lp_settings.time_limit = lp_run_time; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = context.settings.tolerances.absolute_tolerance; lp_settings.return_first_feasible = false; lp_settings.save_state = true; @@ -828,12 +1066,15 @@ diversity_manager_t::recombine_and_local_search(solution_t& offspring_qual, sol1.get_quality(population.weights), sol2.get_quality(population.weights)); f_t best_quality_of_parents = std::min(sol1.get_quality(population.weights), sol2.get_quality(population.weights)); - mab_recombiner.add_mab_reward( - mab_recombiner.last_chosen_option, - best_quality_of_parents, - population.best().get_quality(population.weights), - offspring_qual, - recombiner_work_normalized_reward_t(recombine_stats.get_last_recombiner_time())); + mab_recombiner.add_mab_reward(mab_recombiner.last_chosen_option, + best_quality_of_parents, + population.best().get_quality(population.weights), + offspring_qual, + !deterministic + ? recombiner_work_normalized_reward_t( + deterministic, recombine_stats.get_last_recombiner_time()) + : recombiner_work_normalized_reward_t( + deterministic, recombine_stats.get_last_recombiner_work())); mab_ls.add_mab_reward(mab_ls_config_t::last_ls_mab_option, best_quality_of_parents, population.best_feasible().get_quality(population.weights), @@ -878,31 +1119,50 @@ std::pair, bool> diversity_manager_t::recombine( } } } + CUOPT_DETERMINISM_LOG( + "Deterministic recombiner selection: requested=%s selected_index=%d chosen=%s " + "enabled_size=%zu last_choice_before=%d current_seed=%d", + recombiner_t::recombiner_name(recombiner_type), + (int)selected_index, + recombiner_t::recombiner_name(recombiner), + recombiner_t::enabled_recombiners.size(), + mab_recombiner.last_chosen_option, + (unsigned int)cuopt::seed_generator::get_seed()); mab_recombiner.set_last_chosen_option(selected_index); recombine_stats.add_attempt((recombiner_enum_t)recombiner); recombine_stats.start_recombiner_time(); + CUOPT_DETERMINISM_LOG("Recombining sol %x and %x with recombiner %d, weights %x", + a.get_hash(), + b.get_hash(), + recombiner, + population.weights.get_hash()); + // Refactored code using a switch statement switch (recombiner) { case recombiner_enum_t::BOUND_PROP: { - auto [sol, success] = bound_prop_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = bound_prop_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::FP: { - auto [sol, success] = fp_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = fp_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::LINE_SEGMENT: { - auto [sol, success] = line_segment_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = line_segment_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::SUB_MIP: { - auto [sol, success] = sub_mip_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = sub_mip_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh index 863933de48..0def707221 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ template class diversity_manager_t { public: diversity_manager_t(mip_solver_context_t& context); - bool run_presolve(f_t time_limit, timer_t global_timer); + bool run_presolve(f_t time_limit, cuopt::termination_checker_t& global_timer); solution_t run_solver(); void generate_solution(f_t time_limit, bool random_start = true); void run_fj_alone(solution_t& solution); @@ -50,8 +51,9 @@ class diversity_manager_t { void diversity_step(i_t max_iterations_without_improvement); void add_user_given_solutions(std::vector>& initial_sol_vector); population_t* get_population_pointer() { return &population; } - void recombine_and_ls_with_all(std::vector>& solutions, - bool add_only_feasible = false); + void recombine_and_ls_with_all( + std::vector::drained_external_solution_t>& solutions, + bool add_only_feasible = false); void recombine_and_ls_with_all(solution_t& solution, bool add_only_feasible = false); std::pair, solution_t> recombine_and_local_search( solution_t& a, @@ -65,7 +67,7 @@ class diversity_manager_t { solution_t& sol2); bool run_local_search(solution_t& solution, const weight_t& weights, - timer_t& timer, + termination_checker_t& timer, ls_config_t& ls_config); void consume_staged_simplex_solution(lp_state_t& lp_state); @@ -84,7 +86,7 @@ class diversity_manager_t { std::vector staged_simplex_dual_solution; f_t staged_simplex_objective{std::numeric_limits::infinity()}; local_search_t ls; - cuopt::timer_t timer; + cuopt::termination_checker_t timer; bound_prop_recombiner_t bound_prop_recombiner; fp_recombiner_t fp_recombiner; line_segment_recombiner_t line_segment_recombiner; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c4331343de..2247dfcef4 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -271,10 +271,11 @@ void rins_t::run_rins() branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; branch_and_bound_settings.log.log = false; branch_and_bound_settings.log.log_prefix = "[RINS] "; - branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector& solution, - f_t objective) { - rins_solution_queue.push_back(solution); - }; + branch_and_bound_settings.new_incumbent_callback = + [&rins_solution_queue](std::vector& solution, + f_t objective, + const cuopt::internals::mip_solution_callback_info_t&, + double) { rins_solution_queue.push_back(solution); }; dual_simplex::probing_implied_bound_t empty_probing(branch_and_bound_problem.num_cols); dual_simplex::branch_and_bound_t branch_and_bound( branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic(), empty_probing); @@ -347,8 +348,9 @@ void rins_t::run_rins() cuopt_assert(best_sol.assignment.size() == sol_size_before_rins, "Assignment size mismatch"); cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables, "Assignment size mismatch"); - dm.population.add_external_solution( - best_sol.get_host_assignment(), best_sol.get_objective(), solution_origin_t::RINS); + dm.population.add_external_solution(best_sol.get_host_assignment(), + best_sol.get_objective(), + internals::mip_solution_origin_t::RINS); } } diff --git a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh index 4571d0d57f..b9219b8dcb 100644 --- a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh +++ b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh @@ -45,16 +45,22 @@ struct ls_work_normalized_reward_t { }; struct recombiner_work_normalized_reward_t { - double time_in_miliseconds; - recombiner_work_normalized_reward_t(double time_in_miliseconds) - : time_in_miliseconds(time_in_miliseconds) + bool deterministic; + double work; + recombiner_work_normalized_reward_t(bool deterministic, double work) + : deterministic(deterministic), work(work) { } double operator()(double factor) const { // normal recombiners take 2000 ms - return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000))); + if (!deterministic) { + double time_in_miliseconds = work; + return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000))); + } else { + return factor * (std::max(0.1, 4.0 - (work / 200))); + } } }; diff --git a/cpp/src/mip_heuristics/diversity/population.cu b/cpp/src/mip_heuristics/diversity/population.cu index bb0fdd6d11..d922e2fa7f 100644 --- a/cpp/src/mip_heuristics/diversity/population.cu +++ b/cpp/src/mip_heuristics/diversity/population.cu @@ -8,15 +8,27 @@ #include "diversity_manager.cuh" #include "population.cuh" +#include + #include #include #include #include #include +#include #include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { constexpr double weight_increase_ratio = 2.; @@ -44,7 +56,7 @@ population_t::population_t(std::string const& name_, rng(cuopt::seed_generator::get_seed()), early_exit_primal_generation(false), population_hash_map(*problem_ptr), - timer(0) + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { best_feasible_objective = std::numeric_limits::max(); } @@ -125,11 +137,12 @@ std::pair, solution_t> population_t::ge } template -void population_t::add_solutions_from_vec(std::vector>&& solutions) +void population_t::add_solutions_from_vec( + std::vector>&& solutions, internals::mip_solution_origin_t callback_origin) { raft::common::nvtx::range fun_scope("add_solution_from_vec"); for (auto&& sol : solutions) { - add_solution(std::move(sol)); + add_solution(std::move(sol), callback_origin); } } @@ -143,11 +156,11 @@ size_t population_t::get_external_solution_size() template void population_t::add_external_solution(const std::vector& solution, f_t objective, - solution_origin_t origin) + internals::mip_solution_origin_t origin) { std::lock_guard lock(solution_mutex); - if (origin == solution_origin_t::CPUFJ) { + if (origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP) { external_solution_queue_cpufj.emplace_back(solution, objective, origin); } else { external_solution_queue.emplace_back(solution, objective, origin); @@ -165,7 +178,7 @@ void population_t::add_external_solution(const std::vector& solut } CUOPT_LOG_DEBUG("%s added a solution to population, solution queue size %lu with objective %g", - solution_origin_to_string(origin), + internals::mip_solution_origin_to_string(origin), external_solution_queue.size(), problem_ptr->get_user_obj_from_solver_obj(objective)); if (objective < best_feasible_objective) { @@ -179,9 +192,17 @@ void population_t::add_external_solution(const std::vector& solut template void population_t::add_external_solutions_to_population() { + const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr; + // Keep producer-only behavior only when deterministic B&B is draining the queue instead. + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) && deterministic_bb) { + return; + } // don't do early exit checks here. mutex needs to be acquired to prevent race conditions auto new_sol_vector = get_external_solutions(); - add_solutions_from_vec(std::move(new_sol_vector)); + for (auto& drained_sol : new_sol_vector) { + add_solution(std::move(drained_sol.solution), drained_sol.origin); + } } // normally we would need a lock here but these are boolean types and race conditions are not @@ -194,10 +215,11 @@ void population_t::preempt_heuristic_solver() } template -std::vector> population_t::get_external_solutions() +std::vector::drained_external_solution_t> +population_t::get_external_solutions() { std::lock_guard lock(solution_mutex); - std::vector> return_vector; + std::vector return_vector; i_t counter = 0; f_t new_best_feasible_objective = best_feasible_objective; f_t longest_wait_time = 0; @@ -205,11 +227,11 @@ std::vector> population_t::get_external_solutions for (auto& h_entry : queue) { // ignore CPUFJ solutions if they're not better than the best feasible. // It seems they worsen results on some instances despite the potential for improved diversity - if (h_entry.origin == solution_origin_t::CPUFJ && + if (h_entry.origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP && h_entry.objective > new_best_feasible_objective) { continue; - } else if (h_entry.origin != solution_origin_t::CPUFJ && - h_entry.objective > new_best_feasible_objective) { + } else if (h_entry.origin != internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP && + h_entry.objective < new_best_feasible_objective) { new_best_feasible_objective = h_entry.objective; } @@ -233,7 +255,7 @@ std::vector> population_t::get_external_solutions problem_ptr->n_integer_vars); } sol.handle_ptr->sync_stream(); - return_vector.emplace_back(std::move(sol)); + return_vector.emplace_back(std::move(sol), h_entry.origin); counter++; } } @@ -258,114 +280,53 @@ bool population_t::is_better_than_best_feasible(solution_t& } template -void population_t::invoke_get_solution_callback( - solution_t& sol, internals::get_solution_callback_t* callback) +void population_t::run_solution_callbacks( + solution_t& sol, internals::mip_solution_origin_t callback_origin) { - f_t user_objective = sol.get_user_objective(); - f_t user_bound = context.stats.get_solution_bound(); - solution_t temp_sol(sol); - problem_ptr->post_process_assignment(temp_sol.assignment); - if (problem_ptr->has_papilo_presolve_data()) { - problem_ptr->papilo_uncrush_assignment(temp_sol.assignment); - } - - std::vector user_objective_vec(1); - std::vector user_bound_vec(1); - std::vector user_assignment_vec(temp_sol.assignment.size()); - user_objective_vec[0] = user_objective; - user_bound_vec[0] = user_bound; - raft::copy(user_assignment_vec.data(), - temp_sol.assignment.data(), - temp_sol.assignment.size(), - temp_sol.handle_ptr->get_stream()); - temp_sol.handle_ptr->sync_stream(); - callback->get_solution(user_assignment_vec.data(), - user_objective_vec.data(), - user_bound_vec.data(), - callback->get_user_data()); -} + if (is_better_than_best_feasible(sol)) { + const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr; -template -void population_t::run_solution_callbacks(solution_t& sol) -{ - bool better_solution_found = is_better_than_best_feasible(sol); - auto user_callbacks = context.settings.get_mip_callbacks(); - if (better_solution_found) { - if (context.settings.benchmark_info_ptr != nullptr) { - context.settings.benchmark_info_ptr->last_improvement_of_best_feasible = timer.elapsed_time(); - } - CUOPT_LOG_DEBUG("Population: Found new best solution %g", sol.get_user_objective()); - if (problem_ptr->branch_and_bound_callback != nullptr) { - problem_ptr->branch_and_bound_callback(sol.get_host_assignment()); - } - for (auto callback : user_callbacks) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - invoke_get_solution_callback(sol, get_sol_callback); + if (deterministic_bb) { + const double work_timestamp = context.gpu_heur_loop.current_producer_work(); + cuopt_assert(std::isfinite(work_timestamp), + "Deterministic heuristic work timestamp must be finite"); + context.branch_and_bound_ptr->queue_external_solution_deterministic( + sol.get_host_assignment(), sol.get_user_objective(), work_timestamp, callback_origin); + } else { + if (context.branch_and_bound_ptr != nullptr && + context.problem_ptr->branch_and_bound_callback != nullptr) { + context.problem_ptr->branch_and_bound_callback(sol.get_host_assignment(), callback_origin); } + + const double work_timestamp = context.gpu_heur_loop.current_work(); + const auto payload = context.solution_publication.build_callback_payload( + context.problem_ptr, sol, callback_origin, work_timestamp); + context.solution_publication.publish_new_best_feasible(payload, timer.elapsed_time()); } // Save the best objective here even if callback handling later exits early. // This prevents older solutions from being reported as "new best" in subsequent callbacks. best_feasible_objective = sol.get_objective(); } - for (auto callback : user_callbacks) { - if (callback->get_type() == internals::base_solution_callback_type::SET_SOLUTION) { - auto set_sol_callback = static_cast(callback); - f_t user_bound = context.stats.get_solution_bound(); - auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables(); - rmm::device_uvector incumbent_assignment(callback_num_variables, - sol.handle_ptr->get_stream()); - solution_t outside_sol(sol); - rmm::device_scalar d_outside_sol_objective(sol.handle_ptr->get_stream()); - auto inf = std::numeric_limits::infinity(); - d_outside_sol_objective.set_value_async(inf, sol.handle_ptr->get_stream()); - sol.handle_ptr->sync_stream(); - std::vector h_incumbent_assignment(incumbent_assignment.size()); - std::vector h_outside_sol_objective(1, inf); - std::vector h_user_bound(1, user_bound); - set_sol_callback->set_solution(h_incumbent_assignment.data(), - h_outside_sol_objective.data(), - h_user_bound.data(), - set_sol_callback->get_user_data()); - f_t outside_sol_objective = h_outside_sol_objective[0]; - // The callback might be called without setting any valid solution or objective which triggers - // asserts - if (outside_sol_objective == inf) { return; } - d_outside_sol_objective.set_value_async(outside_sol_objective, sol.handle_ptr->get_stream()); - raft::copy(incumbent_assignment.data(), - h_incumbent_assignment.data(), - incumbent_assignment.size(), - sol.handle_ptr->get_stream()); - - bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment); - if (!is_valid) { return; } - cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(), - "Incumbent assignment size mismatch"); - raft::copy(outside_sol.assignment.data(), - incumbent_assignment.data(), - incumbent_assignment.size(), - sol.handle_ptr->get_stream()); - outside_sol.compute_feasibility(); - - CUOPT_LOG_DEBUG("Injected solution feasibility = %d objective = %g excess = %g", - outside_sol.get_feasible(), - outside_sol.get_user_objective(), - outside_sol.get_total_excess()); - if (std::abs(outside_sol.get_user_objective() - outside_sol_objective) > 1e-6) { - cuopt_func_call( - CUOPT_LOG_DEBUG("External solution objective mismatch: outside_sol.get_user_objective() " - "= %g, outside_sol_objective = %g", - outside_sol.get_user_objective(), - outside_sol_objective)); + context.solution_injection.invoke_set_solution_callbacks( + problem_ptr, + sol, + [this]( + const std::vector& assignment, f_t objective, internals::mip_solution_origin_t origin) { + const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr; + if (deterministic_bb) { + const double work_timestamp = context.gpu_heur_loop.current_producer_work(); + context.branch_and_bound_ptr->queue_external_solution_deterministic( + assignment, + context.problem_ptr->get_user_obj_from_solver_obj(objective), + work_timestamp, + origin); + } else { + add_external_solution(assignment, objective, origin); } - cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6, - "External solution objective mismatch"); - auto h_outside_sol = outside_sol.get_host_assignment(); - add_external_solution( - h_outside_sol, outside_sol.get_objective(), solution_origin_t::EXTERNAL); - } - } + }); } template @@ -401,7 +362,8 @@ void population_t::adjust_weights_according_to_best_feasible() } template -std::pair population_t::add_solution(solution_t&& sol) +std::pair population_t::add_solution( + solution_t&& sol, internals::mip_solution_origin_t callback_origin) { std::lock_guard lock(write_mutex); raft::common::nvtx::range fun_scope("add_solution"); @@ -411,16 +373,18 @@ std::pair population_t::add_solution(solution_t&& // for hash computation, quality calculation, and similarity comparisons. sol.handle_ptr->sync_stream(); population_hash_map.insert(sol); - double sol_cost = sol.get_quality(weights); - bool best_updated = false; - CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d!", + double sol_cost = sol.get_quality(weights); + bool best_updated = false; + const uint32_t candidate_hash = sol.get_hash(); + CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d, hash %x!", sol_cost, sol.get_user_objective(), - sol.n_assigned_integers); + sol.n_assigned_integers, + candidate_hash); // We store the best feasible found so far at index 0. if (sol.get_feasible() && (solutions[0].first == false || sol_cost + OBJECTIVE_EPSILON < indices[0].second)) { - run_solution_callbacks(sol); + run_solution_callbacks(sol, callback_origin); solutions[0].first = true; // we only have move assignment operator solution_t temp_sol(sol); @@ -706,7 +670,7 @@ void population_t::halve_the_population() clear_except_best_feasible(); var_threshold = std::max(var_threshold * 0.97, 0.5 * problem_ptr->n_integer_vars); for (auto& sol : sol_vec) { - add_solution(solution_t(sol)); + add_solution(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } if (counter++ > max_adjustments) break; } @@ -718,7 +682,7 @@ void population_t::halve_the_population() max_var_threshold, std::min((size_t)(var_threshold * 1.02), (size_t)(0.995 * problem_ptr->n_integer_vars))); for (auto& sol : sol_vec) { - add_solution(solution_t(sol)); + add_solution(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } if (counter++ > max_adjustments) break; } @@ -744,7 +708,7 @@ void population_t::start_threshold_adjustment() } template -void population_t::adjust_threshold(cuopt::timer_t timer) +void population_t::adjust_threshold(cuopt::termination_checker_t& timer) { double time_ratio = (timer.elapsed_time() - population_start_time) / (timer.get_time_limit() - population_start_time); @@ -833,23 +797,29 @@ bool population_t::test_invariant() template void population_t::print() { + std::vector hashes; + for (auto& index : indices) + hashes.push_back(solutions[index.first].second.get_hash()); + uint32_t final_hash = compute_hash(hashes); CUOPT_LOG_DEBUG(" -------------- "); - CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d:", + CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d (hash %x):", name.c_str(), infeasibility_importance, var_threshold, - problem_ptr->n_integer_vars); + problem_ptr->n_integer_vars, + final_hash); i_t i = 0; for (auto& index : indices) { if (index.first == 0 && solutions[0].first) { CUOPT_LOG_DEBUG(" Best feasible: %f", solutions[index.first].second.get_user_objective()); } - CUOPT_LOG_DEBUG("%d : %f\t%f\t%f\t%d", + CUOPT_LOG_DEBUG("%d : %f\t%f\t%f\t%d (hash %x)", i, index.second, solutions[index.first].second.get_total_excess(), solutions[index.first].second.get_user_objective(), - solutions[index.first].second.get_feasible()); + solutions[index.first].second.get_feasible(), + solutions[index.first].second.get_hash()); i++; } CUOPT_LOG_DEBUG(" -------------- "); @@ -858,8 +828,8 @@ void population_t::print() template void population_t::run_all_recombiners(solution_t& sol) { - std::vector> sol_vec; - sol_vec.emplace_back(std::move(solution_t(sol))); + std::vector::drained_external_solution_t> sol_vec; + sol_vec.emplace_back(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); dm.recombine_and_ls_with_all(sol_vec, true); } diff --git a/cpp/src/mip_heuristics/diversity/population.cuh b/cpp/src/mip_heuristics/diversity/population.cuh index c83a4bfb83..0ef4cbf67f 100644 --- a/cpp/src/mip_heuristics/diversity/population.cuh +++ b/cpp/src/mip_heuristics/diversity/population.cuh @@ -25,22 +25,20 @@ namespace cuopt::linear_programming::detail { template class diversity_manager_t; -enum class solution_origin_t { BRANCH_AND_BOUND, CPUFJ, RINS, EXTERNAL }; - -constexpr const char* solution_origin_to_string(solution_origin_t origin) -{ - switch (origin) { - case solution_origin_t::BRANCH_AND_BOUND: return "B&B"; - case solution_origin_t::CPUFJ: return "CPUFJ"; - case solution_origin_t::RINS: return "RINS"; - case solution_origin_t::EXTERNAL: return "injected"; - default: return "unknown"; - } -} - template class population_t { public: + struct drained_external_solution_t { + drained_external_solution_t(solution_t&& solution_, + internals::mip_solution_origin_t origin_) + : solution(std::move(solution_)), origin(origin_) + { + } + + solution_t solution; + internals::mip_solution_origin_t origin; + }; + population_t(std::string const& name, mip_solver_context_t& context, diversity_manager_t& dm, @@ -83,6 +81,7 @@ class population_t { a.first = false; indices[0].second = std::numeric_limits::max(); indices.erase(indices.begin() + 1, indices.end()); + best_feasible_objective = std::numeric_limits::max(); } void clear_except_best_feasible() @@ -92,6 +91,7 @@ class population_t { } solutions[indices[0].first].first = true; indices.erase(indices.begin() + 1, indices.end()); + best_feasible_objective = solutions[indices[0].first].second.get_objective(); } // ------------------- @@ -103,16 +103,18 @@ class population_t { /*! \brief { Add a solution to population. Similar solutions may be ejected from the pool. } * \return { -1 = not inserted , others = inserted index} */ - std::pair add_solution(solution_t&& sol); + std::pair add_solution(solution_t&& sol, + internals::mip_solution_origin_t callback_origin); void add_external_solution(const std::vector& solution, f_t objective, - solution_origin_t origin); - std::vector> get_external_solutions(); + internals::mip_solution_origin_t origin); + std::vector get_external_solutions(); void add_external_solutions_to_population(); size_t get_external_solution_size(); void preempt_heuristic_solver(); - void add_solutions_from_vec(std::vector>&& solutions); + void add_solutions_from_vec(std::vector>&& solutions, + internals::mip_solution_origin_t callback_origin); // Updates the cstr weights according to the best solutions feasibility void compute_new_weights(); @@ -122,7 +124,7 @@ class population_t { // updates qualities of each solution void update_qualities(); // adjusts the threshold of the population - void adjust_threshold(cuopt::timer_t timer); + void adjust_threshold(cuopt::termination_checker_t& timer); /*! \param sol { Input solution } * \return { Index of the best solution similar to sol. If no similar is found we return * max_solutions. }*/ @@ -153,7 +155,8 @@ class population_t { std::vector> population_to_vector(); void halve_the_population(); - void run_solution_callbacks(solution_t& sol); + void run_solution_callbacks(solution_t& sol, + internals::mip_solution_origin_t callback_origin); void adjust_weights_according_to_best_feasible(); @@ -161,9 +164,6 @@ class population_t { void diversity_step(i_t max_iterations_without_improvement); - void invoke_get_solution_callback(solution_t& sol, - internals::get_solution_callback_t* callback); - // does some consistency tests bool test_invariant(); @@ -186,7 +186,9 @@ class population_t { struct external_solution_t { external_solution_t() = default; - external_solution_t(const std::vector& solution, f_t objective, solution_origin_t origin) + external_solution_t(const std::vector& solution, + f_t objective, + internals::mip_solution_origin_t origin) : solution(solution), objective(objective), origin(origin), @@ -195,7 +197,7 @@ class population_t { } std::vector solution; f_t objective; - solution_origin_t origin; + internals::mip_solution_origin_t origin; timer_t timer; // debug timer to track how long a solution has lingered in the queue }; @@ -211,7 +213,7 @@ class population_t { // be seeded from an early-FJ incumbent objective before a matching population solution exists. f_t best_feasible_objective = std::numeric_limits::max(); assignment_hash_map_t population_hash_map; - cuopt::timer_t timer; + cuopt::termination_checker_t timer; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh index 9d6bb3902c..61b9c9e59a 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh @@ -29,6 +29,7 @@ class bound_prop_recombiner_t : public recombiner_t { rng(cuopt::seed_generator::get_seed()), vars_to_fix(n_vars, handle_ptr->get_stream()) { + thrust::fill(handle_ptr->get_thrust_policy(), vars_to_fix.begin(), vars_to_fix.end(), -1); } void get_probing_values_for_infeasible( @@ -131,9 +132,9 @@ class bound_prop_recombiner_t : public recombiner_t { }); } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("bound_prop_recombiner"); auto& guiding_solution = a.get_feasible() ? a : b; @@ -148,10 +149,11 @@ class bound_prop_recombiner_t : public recombiner_t { i_t n_vars_from_other = n_different_vars; i_t fixed_from_guiding = 0; i_t fixed_from_other = 0; + i_t seed = cuopt::seed_generator::get_seed(); if (n_different_vars > (i_t)bp_recombiner_config_t::max_n_of_vars_from_other) { fixed_from_guiding = n_vars_from_other - bp_recombiner_config_t::max_n_of_vars_from_other; n_vars_from_other = bp_recombiner_config_t::max_n_of_vars_from_other; - thrust::default_random_engine g{(unsigned int)cuopt::seed_generator::get_seed()}; + thrust::default_random_engine g{(unsigned int)seed}; thrust::shuffle(a.handle_ptr->get_thrust_policy(), this->remaining_indices.data(), this->remaining_indices.data() + n_different_vars, @@ -160,12 +162,35 @@ class bound_prop_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); + + CUOPT_DETERMINISM_LOG("BP_DET: sol_a_hash=0x%x sol_b_hash=0x%x offspring_hash=0x%x, seed %x", + a.get_hash(), + b.get_hash(), + offspring.get_hash(), + seed); + CUOPT_DETERMINISM_LOG("BP_DET: n_different_vars=%d n_vars_from_other=%d n_vars_from_guiding=%d", + n_different_vars, + n_vars_from_other, + n_vars_from_guiding); + CUOPT_DETERMINISM_LOG( + "BP_DET: remaining_indices_hash=0x%x (first %d elements)", + detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()), + std::min((i_t)10, n_vars_from_other)); + CUOPT_DETERMINISM_LOG("BP_DET: guiding_feasible=%d other_feasible=%d expensive_to_fix=%d", + guiding_solution.get_feasible(), + other_solution.get_feasible(), + a.problem_ptr->expensive_to_fix_vars); + CUOPT_DETERMINISM_LOG( + "BP_DET: fixed_from_guiding=%d fixed_from_other=%d", fixed_from_guiding, fixed_from_other); + // if either all integers are from A(meaning all are common) or all integers are from B(meaning // all are different), return if (n_vars_from_guiding == 0 || n_vars_from_other == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: REPLACE! + double work = static_cast(n_vars_from_other) / 1e8; cuopt_assert(a.problem_ptr == b.problem_ptr, "The two solutions should not refer to different problems"); @@ -175,9 +200,16 @@ class bound_prop_recombiner_t : public recombiner_t { a.handle_ptr->get_stream()); probing_config_t probing_config(a.problem_ptr->n_variables, a.handle_ptr); if (guiding_solution.get_feasible() && !a.problem_ptr->expensive_to_fix_vars) { + CUOPT_DETERMINISM_LOG("BP_DET: Taking FEASIBLE path (with variable fixing)"); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); + CUOPT_DETERMINISM_LOG("BP_DET: vars_to_fix_size=%lu", vars_to_fix.size()); auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix); - timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit); + CUOPT_DETERMINISM_LOG("BP_DET: fixed_problem_fingerprint=0x%x variable_map_size=%lu", + fixed_problem.get_fingerprint(), + variable_map.size()); + termination_checker_t timer(this->context.gpu_heur_loop, + bp_recombiner_config_t::bounds_prop_time_limit, + *this->context.termination); rmm::device_uvector old_assignment(offspring.assignment, offspring.handle_ptr->get_stream()); offspring.handle_ptr->sync_stream(); @@ -197,26 +229,44 @@ class bound_prop_recombiner_t : public recombiner_t { constraint_prop.single_rounding_only = true; constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config); constraint_prop.single_rounding_only = false; - cuopt_func_call(bool feasible_after_bounds_prop = offspring.get_feasible()); + offspring.compute_feasibility(); + bool feasible_after_bounds_prop = offspring.get_feasible(); offspring.handle_ptr->sync_stream(); offspring.problem_ptr = a.problem_ptr; fixed_assignment = std::move(offspring.assignment); offspring.assignment = std::move(old_assignment); offspring.handle_ptr->sync_stream(); offspring.unfix_variables(fixed_assignment, variable_map); - cuopt_func_call(bool feasible_after_unfix = offspring.get_feasible()); - // May be triggered due to numerical issues - // TODO: investigate further - // cuopt_assert(feasible_after_unfix == feasible_after_bounds_prop, - // "Feasible after unfix should be same as feasible after bounds prop!"); + offspring.compute_feasibility(); + bool feasible_after_unfix = offspring.get_feasible(); + cuopt_func_call(f_t excess_after_unfix = offspring.get_total_excess()); + if (feasible_after_unfix != feasible_after_bounds_prop) { + CUOPT_LOG_WARN("Numerical issue in bounds prop, infeasibility after unfix"); + // might become infeasible after unfixing due to numerical issues. Check that the excess + // remains consistent + // CUOPT_LOG_ERROR("Excess: %g, %g, %g, %g, feas %d", offspring.get_total_excess(), + // offspring.compute_max_constraint_violation(), offspring.compute_max_int_violation(), + // offspring.compute_max_variable_violation(), feasible_after_unfix); + // cuopt_assert(fabs(excess_after_unfix - excess_before) < 1e-6, + // "Excess after unfix should be same as before unfix!"); + } a.handle_ptr->sync_stream(); } else { - timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit); + CUOPT_DETERMINISM_LOG("BP_DET: Taking INFEASIBLE path (no variable fixing)"); + termination_checker_t timer(this->context.gpu_heur_loop, + bp_recombiner_config_t::bounds_prop_time_limit, + *this->context.termination); get_probing_values_for_infeasible( guiding_solution, other_solution, offspring, probing_values, n_vars_from_other); probing_config.probing_values = host_copy(probing_values, offspring.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "BP_DET: probing_values_hash=0x%x", + detail::compute_hash(make_span(probing_values), a.handle_ptr->get_stream())); constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config); } + CUOPT_DETERMINISM_LOG("BP_DET: After apply_round: offspring_hash=0x%x feasible=%d", + offspring.get_hash(), + offspring.get_feasible()); constraint_prop.max_n_failed_repair_iterations = 1; cuopt_func_call(offspring.test_number_all_integer()); bool better_cost_than_parents = @@ -236,11 +286,17 @@ class bound_prop_recombiner_t : public recombiner_t { bp_recombiner_config_t::decrease_max_n_of_vars_from_other(); } } + CUOPT_DETERMINISM_LOG( + "BP_DET: Final offspring_hash=0x%x same_as_parents=%d better_cost=%d better_feas=%d", + offspring.get_hash(), + same_as_parents, + better_cost_than_parents, + better_feasibility_than_parents); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } rmm::device_uvector vars_to_fix; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh index 1cca1ba371..6edc1471b0 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh @@ -35,9 +35,9 @@ class fp_recombiner_t : public recombiner_t { { } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("FP recombiner"); auto& guiding_solution = a.get_feasible() ? a : b; @@ -50,6 +50,7 @@ class fp_recombiner_t : public recombiner_t { CUOPT_LOG_DEBUG("FP rec: Number of different variables %d MAX_VARS %d", n_different_vars, fp_recombiner_config_t::max_n_of_vars_from_other); + CUOPT_DETERMINISM_LOG("FP rec: offspring hash 0x%x", offspring.get_hash()); i_t n_vars_from_other = n_different_vars; if (n_vars_from_other > (i_t)fp_recombiner_config_t::max_n_of_vars_from_other) { n_vars_from_other = fp_recombiner_config_t::max_n_of_vars_from_other; @@ -62,17 +63,34 @@ class fp_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; if (n_vars_from_other == 0 || n_vars_from_guiding == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: CHANGE + double work = static_cast(n_vars_from_other) / 1e8; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); + CUOPT_DETERMINISM_LOG( + "FP rec post computevarstofix: offspring hash 0x%x, vars to fix 0x%x", + offspring.get_hash(), + detail::compute_hash(make_span(vars_to_fix), offspring.handle_ptr->get_stream())); auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix); + CUOPT_DETERMINISM_LOG( + "FP rec: fixed_problem hash 0x%x assigned hash 0x%x", + fixed_problem.get_fingerprint(), + detail::compute_hash(make_span(fixed_assignment), offspring.handle_ptr->get_stream())); fixed_problem.check_problem_representation(true); if (!guiding_solution.get_feasible() && !other_solution.get_feasible()) { + CUOPT_DETERMINISM_LOG("FP rec: running LP with infeasibility detection"); relaxed_lp_settings_t lp_settings; lp_settings.time_limit = fp_recombiner_config_t::infeasibility_detection_time_limit; - lp_settings.tolerance = fixed_problem.tolerances.absolute_tolerance; + if (this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + lp_settings.time_limit = std::numeric_limits::max(); + lp_settings.work_limit = fp_recombiner_config_t::infeasibility_detection_time_limit; + lp_settings.work_context = &this->context.gpu_heur_loop; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } + lp_settings.tolerance = fixed_problem.tolerances.absolute_tolerance; lp_settings.return_first_feasible = true; lp_settings.save_state = true; lp_settings.check_infeasibility = true; @@ -83,7 +101,7 @@ class fp_recombiner_t : public recombiner_t { lp_response.get_termination_status() == pdlp_termination_status_t::DualInfeasible || lp_response.get_termination_status() == pdlp_termination_status_t::TimeLimit) { CUOPT_LOG_DEBUG("FP recombiner failed because LP found infeasible!"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } } // brute force rounding threshold is 8 @@ -96,7 +114,16 @@ class fp_recombiner_t : public recombiner_t { offspring.handle_ptr->sync_stream(); offspring.assignment = std::move(fixed_assignment); cuopt_func_call(offspring.test_variable_bounds(false)); - timer_t timer(fp_recombiner_config_t::fp_time_limit); + CUOPT_DETERMINISM_LOG( + "FP rec pre-descent: offspring_hash=0x%x fixed_assignment_hash=0x%x " + "problem_fingerprint=0x%x fixed_n_integer_vars=%d", + offspring.get_hash(), + detail::compute_hash(offspring.assignment, offspring.handle_ptr->get_stream()), + fixed_problem.get_fingerprint(), + fixed_problem.n_integer_vars); + termination_checker_t timer(this->context.gpu_heur_loop, + fp_recombiner_config_t::fp_time_limit, + *this->context.termination); fp.timer = timer; fp.cycle_queue.reset(offspring); fp.reset(); @@ -134,9 +161,9 @@ class fp_recombiner_t : public recombiner_t { !guiding_solution.get_feasible(); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } rmm::device_uvector vars_to_fix; // keep a copy of FP to prevent interference with generation FP diff --git a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh index d413af86cd..72482934de 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh @@ -66,22 +66,26 @@ class line_segment_recombiner_t : public recombiner_t { return delta_vector; } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("line_segment_recombiner"); + CUOPT_DETERMINISM_LOG("LS rec: a %d b %d", a.get_hash(), b.get_hash()); auto& guiding_solution = a.get_feasible() ? a : b; auto& other_solution = a.get_feasible() ? b : a; // copy the solution from A solution_t offspring(guiding_solution); - timer_t line_segment_timer{ls_recombiner_config_t::time_limit}; + termination_checker_t line_segment_timer{ + this->context.gpu_heur_loop, ls_recombiner_config_t::time_limit, *this->context.termination}; // TODO after we have the conic combination, detect the lambda change // (i.e. the integral variables flip on line segment) i_t n_points_to_search = ls_recombiner_config_t::n_points_to_search; const bool is_feasibility_run = false; i_t n_different_vars = this->assign_same_integer_values(guiding_solution, other_solution, offspring); + // TODO: CHANGE + double work = static_cast(n_different_vars) / 1e8; rmm::device_uvector delta_vector = generate_delta_vector( guiding_solution, other_solution, offspring, n_points_to_search, n_different_vars); line_segment_search.fj.copy_weights(weights, offspring.handle_ptr); @@ -117,9 +121,9 @@ class line_segment_recombiner_t : public recombiner_t { } if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } line_segment_search_t& line_segment_search; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh index 4782e9612b..82f2dfb8e8 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,18 @@ __global__ void assign_same_variables_kernel(typename solution_t::view template class recombiner_t { public: + static const char* recombiner_name(recombiner_enum_t recombiner) + { + switch (recombiner) { + case recombiner_enum_t::BOUND_PROP: return "BOUND_PROP"; + case recombiner_enum_t::FP: return "FP"; + case recombiner_enum_t::LINE_SEGMENT: return "LINE_SEGMENT"; + case recombiner_enum_t::SUB_MIP: return "SUB_MIP"; + case recombiner_enum_t::SIZE: return "SIZE"; + } + return "UNKNOWN"; + } + recombiner_t(mip_solver_context_t& context_, i_t n_integer_vars, const raft::handle_t* handle_ptr) @@ -92,6 +105,15 @@ class recombiner_t { cuopt::make_span(remaining_indices), n_remaining.data()); i_t remaining_variables = this->n_remaining.value(a.handle_ptr->get_stream()); + // Sort the indices to resolve nondeterministic order due to atomicAdd + thrust::sort(a.handle_ptr->get_thrust_policy(), + this->remaining_indices.data(), + this->remaining_indices.data() + remaining_variables); + + CUOPT_DETERMINISM_LOG( + "remaining indices hash 0x%x, size %d", + detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()), + remaining_variables); auto vec_remaining_indices = host_copy(this->remaining_indices.data(), remaining_variables, a.handle_ptr->get_stream()); @@ -173,6 +195,12 @@ class recombiner_t { i_t n_vars_from_guiding) { vars_to_fix.resize(n_vars_from_guiding, offspring.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "remaining indices hash 0x%x", + detail::compute_hash(make_span(this->remaining_indices), offspring.handle_ptr->get_stream())); + CUOPT_DETERMINISM_LOG("integer_indices hash 0x%x", + detail::compute_hash(make_span(offspring.problem_ptr->integer_indices), + offspring.handle_ptr->get_stream())); // set difference needs two sorted arrays thrust::sort(offspring.handle_ptr->get_thrust_policy(), this->remaining_indices.data(), @@ -195,27 +223,50 @@ class recombiner_t { "vars_to_fix should be sorted!"); } - static void init_enabled_recombiners(const problem_t& problem, + static void init_enabled_recombiners(mip_solver_context_t& context, + const problem_t& problem, int user_enabled_mask = -1) { std::unordered_set enabled_recombiners; + const bool disable_fp_and_submip_for_expensive_fix = problem.expensive_to_fix_vars; + const i_t n_continuous_vars = problem.n_variables - problem.n_integer_vars; + const bool disable_submip_for_continuous_limit = + n_continuous_vars > (i_t)sub_mip_recombiner_config_t::max_continuous_vars; for (auto recombiner : recombiner_types) { if (user_enabled_mask >= 0 && !(user_enabled_mask & (1 << (uint32_t)recombiner))) { continue; } enabled_recombiners.insert(recombiner); } - if (problem.expensive_to_fix_vars) { + if (disable_fp_and_submip_for_expensive_fix) { enabled_recombiners.erase(recombiner_enum_t::FP); enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); } // check the size of the continous vars - if (problem.n_variables - problem.n_integer_vars > - (i_t)sub_mip_recombiner_config_t::max_continuous_vars) { + if (disable_submip_for_continuous_limit) { enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); } recombiner_t::enabled_recombiners = std::vector(enabled_recombiners.begin(), enabled_recombiners.end()); + cuopt_assert(!recombiner_t::enabled_recombiners.empty(), "No recombiners enabled after init"); + std::string order_str; + for (size_t i = 0; i < recombiner_t::enabled_recombiners.size(); ++i) { + if (i > 0) { order_str += ','; } + order_str += recombiner_name(recombiner_t::enabled_recombiners[i]); + } + CUOPT_DETERMINISM_LOG( + "Deterministic recombiner init: expensive_to_fix=%d n_continuous=%d " + "max_continuous=%zu disable_fp_submip_expensive=%d " + "disable_submip_continuous=%d disable_submip_deterministic=%d size=%zu " + "order=[%s]", + (int)problem.expensive_to_fix_vars, + (int)n_continuous_vars, + sub_mip_recombiner_config_t::max_continuous_vars, + (int)disable_fp_and_submip_for_expensive_fix, + (int)disable_submip_for_continuous_limit, + (int)disable_submip_for_determinism, + recombiner_t::enabled_recombiners.size(), + order_str.c_str()); } mip_solver_context_t& context; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp index 044e313284..6cd2767f81 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp +++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp @@ -75,8 +75,13 @@ struct all_recombine_stats { // enum of the last attempted recombiner std::optional last_attempt; - double last_recombiner_time; + double last_recombiner_time{0.0}; std::chrono::high_resolution_clock::time_point last_recombiner_start_time; + double last_recombiner_work{0.0}; + + void set_recombiner_work(double work) { last_recombiner_work = work; } + + double get_last_recombiner_work() { return last_recombiner_work; } void start_recombiner_time() { diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh index 5a637aae8e..89b8e06079 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh @@ -38,9 +38,9 @@ class sub_mip_recombiner_t : public recombiner_t { solution_vector.push_back(solution); } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("Sub-MIP recombiner"); solution_vector.clear(); @@ -66,8 +66,10 @@ class sub_mip_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; if (n_vars_from_other == 0 || n_vars_from_guiding == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: CHANGE + double work = static_cast(n_vars_from_other) / 1e8; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); @@ -102,6 +104,10 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_solution.resize(branch_and_bound_problem.num_cols); // Fill in the settings for branch and bound branch_and_bound_settings.time_limit = sub_mip_recombiner_config_t::sub_mip_time_limit; + if (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + branch_and_bound_settings.deterministic = true; + branch_and_bound_settings.work_limit = sub_mip_recombiner_config_t::sub_mip_time_limit; + } branch_and_bound_settings.print_presolve_stats = false; branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; @@ -112,10 +118,11 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_settings.clique_cuts = 0; branch_and_bound_settings.sub_mip = 1; branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; - branch_and_bound_settings.solution_callback = [this](std::vector& solution, - f_t objective) { - this->solution_callback(solution, objective); - }; + branch_and_bound_settings.new_incumbent_callback = + [this](std::vector& solution, + f_t objective, + const cuopt::internals::mip_solution_callback_info_t&, + double) { this->solution_callback(solution, objective); }; // disable B&B logs, so that it is not interfering with the main B&B thread branch_and_bound_settings.log.log = false; @@ -124,6 +131,10 @@ class sub_mip_recombiner_t : public recombiner_t { dual_simplex::branch_and_bound_t branch_and_bound( branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic(), empty_probing); branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution); + if (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + double sub_mip_work = branch_and_bound.get_work_unit_context().current_work(); + context.gpu_heur_loop.record_work_sync_on_horizon(sub_mip_work); + } if (solution_vector.size() > 0) { cuopt_assert(fixed_assignment.size() == branch_and_bound_solution.x.size(), "Assignment size mismatch"); @@ -185,7 +196,7 @@ class sub_mip_recombiner_t : public recombiner_t { sol.clamp_within_bounds(); // Scaling might bring some very slight variable bound violations sol.compute_feasibility(); cuopt_func_call(sol.test_variable_bounds()); - population.add_solution(std::move(sol)); + population.add_solution(std::move(sol), internals::mip_solution_origin_t::SUB_MIP); } bool better_cost_than_parents = offspring.get_quality(weights) < @@ -195,9 +206,9 @@ class sub_mip_recombiner_t : public recombiner_t { !guiding_solution.get_feasible(); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !std::isnan(branch_and_bound_solution.objective)); + return std::make_tuple(offspring, !std::isnan(branch_and_bound_solution.objective), work); } rmm::device_uvector vars_to_fix; mip_solver_context_t& context; diff --git a/cpp/src/mip_heuristics/diversity/weights.cuh b/cpp/src/mip_heuristics/diversity/weights.cuh index 7502ae9210..fbe72aba8e 100644 --- a/cpp/src/mip_heuristics/diversity/weights.cuh +++ b/cpp/src/mip_heuristics/diversity/weights.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -12,6 +12,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template @@ -25,6 +27,11 @@ struct weight_t { objective_weight.set_value_async(one, handle_ptr->get_stream()); } + uint32_t get_hash(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return compute_hash(cstr_weights, stream) ^ compute_hash(objective_weight.value(stream)); + } + rmm::device_uvector cstr_weights; rmm::device_scalar objective_weight; }; diff --git a/cpp/src/mip_heuristics/early_heuristic.cuh b/cpp/src/mip_heuristics/early_heuristic.cuh index 090cfd4901..a1c73ae217 100644 --- a/cpp/src/mip_heuristics/early_heuristic.cuh +++ b/cpp/src/mip_heuristics/early_heuristic.cuh @@ -24,8 +24,10 @@ namespace cuopt::linear_programming::detail { template -using early_incumbent_callback_t = std::function& assignment, const char* heuristic_name)>; +using early_incumbent_callback_t = std::function& assignment, + internals::mip_solution_origin_t origin)>; // CRTP base for early heuristics that run on the original (or papilo-presolved) problem // during presolve to find incumbents as early as possible. @@ -92,7 +94,7 @@ class early_heuristic_t { // Log and callback are deferred to the shared incumbent_callback_ which enforces // global monotonicity across all early heuristic instances. if (incumbent_callback_) { - incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::name()); + incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::origin()); } } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh index 911e846551..89bdff1092 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh @@ -27,6 +27,10 @@ class early_cpufj_t : public early_heuristic_t ~early_cpufj_t(); static constexpr const char* name() { return "CPUFJ"; } + static constexpr internals::mip_solution_origin_t origin() + { + return internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP; + } void start(); void stop(); diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 3f77427d87..59ad7ed0fd 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -52,10 +52,10 @@ void early_gpufj_t::start() fj_ptr_ = std::make_unique>(*context_ptr_, fj_settings); - fj_ptr_->improvement_callback = [this](f_t user_obj, const std::vector& h_assignment) { + fj_ptr_->set_improvement_callback([this](f_t user_obj, const std::vector& h_assignment) { f_t solver_obj = this->problem_ptr_->get_solver_obj_from_user_obj(user_obj); this->try_update_best(solver_obj, h_assignment); - }; + }); worker_thread_ = std::make_unique(&early_gpufj_t::run_worker, this); } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh index 4a7769143e..f09fc011d5 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh @@ -30,6 +30,10 @@ class early_gpufj_t : public early_heuristic_t ~early_gpufj_t(); static constexpr const char* name() { return "GPUFJ"; } + static constexpr internals::mip_solution_origin_t origin() + { + return internals::mip_solution_origin_t::FEASIBILITY_JUMP; + } void start(); void stop(); diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu index 748dd41dfb..77de354cfb 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include #include #include @@ -63,7 +65,8 @@ fj_t::fj_t(mip_solver_context_t& context_, fj_settings_t in_ work_id_to_nonbin_var_idx(pb_ptr->coefficients.size(), pb_ptr->handle_ptr->get_stream()), row_size_bin_prefix_sum(pb_ptr->binary_indices.size(), pb_ptr->handle_ptr->get_stream()), row_size_nonbin_prefix_sum(pb_ptr->nonbinary_indices.size(), pb_ptr->handle_ptr->get_stream()), - work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream()) + work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream()), + deterministic_frontier_work_by_var_d_(0, pb_ptr->handle_ptr->get_stream()) { setval_launch_dims = get_launch_dims_max_occupancy( (void*)update_assignment_kernel, TPB_setval, pb_ptr->handle_ptr); @@ -111,6 +114,158 @@ void fj_t::reset_cuda_graph() graph_created = false; } +template +bool fj_t::use_load_balancing_codepath() const +{ + bool use_load_balancing = false; + if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_OFF) { + use_load_balancing = false; + } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_ON) { + use_load_balancing = true; + } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::AUTO) { + use_load_balancing = + pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount; + } + if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; } + return use_load_balancing; +} + +// precompute estimates of the amount of work performed per selected variable +// using the related_variables table to estimate the nnz touched +// will be replaced with a model estimator in the future. +template +void fj_t::initialize_deterministic_work_estimator() +{ + const i_t num_vars = pb_ptr->n_variables; + const i_t num_cstrs = pb_ptr->n_constraints; + const double total_nnz = static_cast(pb_ptr->coefficients.size()); + + deterministic_refresh_work_ = total_nnz; + deterministic_average_frontier_work_ = total_nnz; + if (num_vars == 0) { return; } + + auto stream = handle_ptr->get_stream(); + auto policy = handle_ptr->get_thrust_policy(); + + // degree[v] = number of constraints variable v appears in + rmm::device_uvector degree(num_vars, stream); + auto rev_offsets = make_span(pb_ptr->reverse_offsets); + thrust::tabulate(policy, degree.begin(), degree.end(), [rev_offsets] __device__(i_t v) -> double { + return (double)(rev_offsets[v + 1] - rev_offsets[v]); + }); + + deterministic_frontier_work_by_var_d_.resize(num_vars, stream); + + if (pb_ptr->related_variables_offsets.size() > 0 && pb_ptr->related_variables.size() > 0) { + // Exact path: segmented reduce over the precomputed related_variables table + auto degree_ptr = degree.data(); + auto related_offsets = pb_ptr->related_variables_offsets.data(); + auto degree_of_related = thrust::make_transform_iterator( + pb_ptr->related_variables.begin(), [degree_ptr, num_vars] __device__(i_t rv) -> double { + return (rv >= 0 && rv < num_vars) ? degree_ptr[rv] : 0.0; + }); + + size_t temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + degree_of_related, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + related_offsets, + related_offsets + 1, + stream); + rmm::device_uvector temp(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + degree_of_related, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + related_offsets, + related_offsets + 1, + stream); + + } else { + // SpMV path: frontier_work ≈ A^T * (A * degree) + // Overestimates by double-counting shared neighbors, but deterministic and + // load-balanced. Acceptable for a work-unit proxy. + + // Step 1: y[c] = sum of degree[v] for v in constraint c + rmm::device_uvector y(num_cstrs, stream); + auto degree_ptr = degree.data(); + auto offsets_ptr = pb_ptr->offsets.data(); + auto degree_of_var = thrust::make_transform_iterator( + pb_ptr->variables.begin(), + [degree_ptr] __device__(i_t v) -> double { return degree_ptr[v]; }); + + size_t temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + degree_of_var, + y.data(), + num_cstrs, + offsets_ptr, + offsets_ptr + 1, + stream); + rmm::device_uvector temp(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + degree_of_var, + y.data(), + num_cstrs, + offsets_ptr, + offsets_ptr + 1, + stream); + + // Step 2: frontier_work[v] = sum of y[c] for c in constraints_of(v) + auto rev_offs_ptr = pb_ptr->reverse_offsets.data(); + auto y_ptr = y.data(); + auto y_of_constraint = + thrust::make_transform_iterator(pb_ptr->reverse_constraints.begin(), + [y_ptr] __device__(i_t c) -> double { return y_ptr[c]; }); + + temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + y_of_constraint, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + rev_offs_ptr, + rev_offs_ptr + 1, + stream); + temp.resize(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + y_of_constraint, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + rev_offs_ptr, + rev_offs_ptr + 1, + stream); + } + + deterministic_average_frontier_work_ = + thrust::reduce(policy, + deterministic_frontier_work_by_var_d_.begin(), + deterministic_frontier_work_by_var_d_.end(), + 0.0, + thrust::plus()) / + (double)num_vars; + deterministic_frontier_work_by_var_.resize(num_vars); + raft::copy(deterministic_frontier_work_by_var_.data(), + deterministic_frontier_work_by_var_d_.data(), + num_vars, + stream); + + CUOPT_LOG_DEBUG( + "FJ determ: initialized frontier work estimator avg_frontier_nnz=%.6f refresh_nnz=%.6f " + "vars=%zu nnz=%zu load_balancing=%d", + deterministic_average_frontier_work_, + deterministic_refresh_work_, + num_vars, + pb_ptr->coefficients.size(), + (int)use_load_balancing_codepath()); +} + template fj_t::~fj_t() { @@ -189,38 +344,43 @@ fj_t::climber_data_t::view_t fj_t::climber_data_t::view() v.jump_candidates = make_span(jump_candidates); v.jump_candidate_count = make_span(jump_candidate_count); v.jump_locks = make_span(jump_locks); - v.candidate_arrived_workids = make_span(candidate_arrived_workids); - v.grid_score_buf = make_span(grid_score_buf); - v.grid_delta_buf = make_span(grid_delta_buf); - v.grid_var_buf = make_span(grid_var_buf); - v.row_size_bin_prefix_sum = make_span(fj.row_size_bin_prefix_sum); - v.row_size_nonbin_prefix_sum = make_span(fj.row_size_nonbin_prefix_sum); - v.work_id_to_bin_var_idx = make_span(fj.work_id_to_bin_var_idx); - v.work_id_to_nonbin_var_idx = make_span(fj.work_id_to_nonbin_var_idx); - v.work_ids_for_related_vars = make_span(fj.work_ids_for_related_vars); - v.fractional_variables = fractional_variables.view(); - v.saved_best_fractional_count = saved_best_fractional_count.data(); - v.handle_fractionals_only = handle_fractionals_only.data(); - v.selected_var = selected_var.data(); - v.violation_score = violation_score.data(); - v.weighted_violation_score = weighted_violation_score.data(); - v.constraints_changed_count = constraints_changed_count.data(); - v.local_minimums_reached = local_minimums_reached.data(); - v.iterations = iterations.data(); - v.best_excess = best_excess.data(); - v.best_objective = best_objective.data(); - v.saved_solution_objective = saved_solution_objective.data(); - v.incumbent_quality = incumbent_quality.data(); - v.incumbent_objective = incumbent_objective.data(); - v.weight_update_increment = fj.weight_update_increment; - v.objective_weight = fj.objective_weight.data(); - v.last_minimum_iteration = last_minimum_iteration.data(); - v.last_improving_minimum = last_improving_minimum.data(); - v.last_iter_candidates = last_iter_candidates.data(); - v.relvar_count_last_update = relvar_count_last_update.data(); - v.load_balancing_skip = load_balancing_skip.data(); - v.break_condition = break_condition.data(); - v.temp_break_condition = temp_break_condition.data(); + v.candidate_arrived_workids = make_span(candidate_arrived_workids); + v.grid_score_buf = make_span(grid_score_buf); + v.grid_delta_buf = make_span(grid_delta_buf); + v.grid_var_buf = make_span(grid_var_buf); + v.row_size_bin_prefix_sum = make_span(fj.row_size_bin_prefix_sum); + v.row_size_nonbin_prefix_sum = make_span(fj.row_size_nonbin_prefix_sum); + v.work_id_to_bin_var_idx = make_span(fj.work_id_to_bin_var_idx); + v.work_id_to_nonbin_var_idx = make_span(fj.work_id_to_nonbin_var_idx); + v.work_ids_for_related_vars = make_span(fj.work_ids_for_related_vars); + v.deterministic_frontier_work_by_var = make_span(fj.deterministic_frontier_work_by_var_d_); + v.fractional_variables = fractional_variables.view(); + v.saved_best_fractional_count = saved_best_fractional_count.data(); + v.handle_fractionals_only = handle_fractionals_only.data(); + v.selected_var = selected_var.data(); + v.violation_score = violation_score.data(); + v.weighted_violation_score = weighted_violation_score.data(); + v.constraints_changed_count = constraints_changed_count.data(); + v.local_minimums_reached = local_minimums_reached.data(); + v.iterations = iterations.data(); + v.best_excess = best_excess.data(); + v.best_objective = best_objective.data(); + v.saved_solution_objective = saved_solution_objective.data(); + v.incumbent_quality = incumbent_quality.data(); + v.incumbent_objective = incumbent_objective.data(); + v.weight_update_increment = fj.weight_update_increment; + v.objective_weight = fj.objective_weight.data(); + v.last_minimum_iteration = last_minimum_iteration.data(); + v.last_improving_minimum = last_improving_minimum.data(); + v.last_iter_candidates = last_iter_candidates.data(); + v.relvar_count_last_update = relvar_count_last_update.data(); + v.load_balancing_skip = load_balancing_skip.data(); + v.break_condition = break_condition.data(); + v.temp_break_condition = temp_break_condition.data(); + v.deterministic_batch_work = deterministic_batch_work.data(); + v.deterministic_refresh_work = fj.deterministic_refresh_work_; + v.deterministic_work_accounting = + (fj.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); v.best_jump_idx = best_jump_idx.data(); v.small_move_tabu = small_move_tabu.data(); v.stop_threshold = fj.stop_threshold; @@ -432,9 +592,7 @@ void fj_t::climber_init(i_t climber_idx, const rmm::cuda_stream_view& f_t inf = std::numeric_limits::infinity(); climber->best_objective.set_value_async(inf, climber_stream); climber->saved_solution_objective.set_value_async(inf, climber_stream); - climber->violation_score.set_value_to_zero_async(climber_stream); - climber->weighted_violation_score.set_value_to_zero_async(climber_stream); - init_lhs_and_violation<<<256, 256, 0, climber_stream.value()>>>(view); + refresh_lhs_and_violation(climber_stream); // initialize the best_objective values according to the initial assignment f_t best_obj = compute_objective_from_vec( @@ -458,6 +616,7 @@ void fj_t::climber_init(i_t climber_idx, const rmm::cuda_stream_view& climber->last_iter_candidates.set_value_to_zero_async(climber_stream); climber->relvar_count_last_update.set_value_to_zero_async(climber_stream); climber->load_balancing_skip.set_value_to_zero_async(climber_stream); + climber->deterministic_batch_work.set_value_to_zero_async(climber_stream); climber->constraints_changed_count.set_value_to_zero_async(climber_stream); climber->iterations.set_value_to_zero_async(climber_stream); climber->full_refresh_iteration.set_value_to_zero_async(climber_stream); @@ -650,10 +809,10 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream auto [grid_setval, blocks_setval] = setval_launch_dims; auto [grid_update_changed_constraints, blocks_update_changed_constraints] = update_changed_constraints_launch_dims; - auto [grid_resetmoves, blocks_resetmoves] = resetmoves_launch_dims; - auto [grid_resetmoves_bin, blocks_resetmoves_bin] = resetmoves_bin_launch_dims; - auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims; - auto [grid_lift_move, blocks_lift_move] = lift_move_launch_dims; + auto [grid_resetmoves, blocks_resetmoves] = resetmoves_launch_dims; + auto [grid_resetmoves_bin, blocks_resetmoves_bin] = resetmoves_bin_launch_dims; + [[maybe_unused]] auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims; + [[maybe_unused]] auto [grid_lift_move, blocks_lift_move] = lift_move_launch_dims; auto& data = *climbers[climber_idx]; auto v = data.view(); @@ -669,17 +828,10 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream // as it breaks assumptions in the binary_pb codepath if (settings.mode == fj_mode_t::ROUNDING) { is_binary_pb = false; } - bool use_load_balancing = false; - if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_OFF) { - use_load_balancing = false; - } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_ON) { - use_load_balancing = true; - } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::AUTO) { - use_load_balancing = - pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount; + bool use_load_balancing = use_load_balancing_codepath(); + if (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + data.deterministic_batch_work.set_value_to_zero_async(climber_stream); } - // Load-balanced codepath not updated yet to handle rounding mode - if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; } cudaGraph_t graph; void* kernel_args[] = {&v}; @@ -841,9 +993,40 @@ void fj_t::refresh_lhs_and_violation(const rmm::cuda_stream_view& stre auto v = data.view(); data.violated_constraints.clear(stream); - data.violation_score.set_value_to_zero_async(stream); - data.weighted_violation_score.set_value_to_zero_async(stream); - init_lhs_and_violation<<<4096, 256, 0, stream>>>(v); + init_lhs_and_violated_constraints<<<4096, 256, 0, stream>>>(v); + // both transformreduce could be fused; but oh well hardly a bottleneck + auto violation = + thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(pb_ptr->n_constraints), + cuda::proclaim_return_type([v] __device__(i_t cstr_idx) { + return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]); + }), + (f_t)0, + thrust::plus()); + auto weighted_violation = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(pb_ptr->n_constraints), + cuda::proclaim_return_type([v] __device__(i_t cstr_idx) { + return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]) * v.cstr_weights[cstr_idx]; + }), + (f_t)0, + thrust::plus()); + data.violation_score.set_value_async(violation, stream); + data.weighted_violation_score.set_value_async(weighted_violation, stream); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + data.violated_constraints.sort(stream); + } +#if FJ_SINGLE_STEP + CUOPT_LOG_DEBUG("hash assignment %x, hash lhs %x, hash lhscomp %x", + detail::compute_hash(data.incumbent_assignment, stream), + detail::compute_hash(data.incumbent_lhs, stream), + detail::compute_hash(data.incumbent_lhs_sumcomp, stream)); + CUOPT_LOG_DEBUG("Violated constraints hash post sort: %x, index map %x", + detail::compute_hash(data.violated_constraints.contents, stream), + detail::compute_hash(data.violated_constraints.index_map, stream)); +#endif } template @@ -851,6 +1034,10 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) { auto& data = *climbers[climber_idx]; auto v = data.view(); // == climber_views[climber_idx] + const bool deterministic_work_estimate = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + const bool use_graph = true; + const i_t iterations_per_batch = use_graph ? iterations_per_graph : 1; auto climber_stream = data.stream.view(); if (climber_idx == 0) climber_stream = handle_ptr->get_stream(); @@ -865,12 +1052,13 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) data.incumbent_quality.set_value_async(obj, handle_ptr->get_stream()); - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); i_t steps; bool limit_reached = false; - for (steps = 0; steps < std::numeric_limits::max(); steps += iterations_per_graph) { + for (steps = 0; steps < std::numeric_limits::max(); steps += iterations_per_batch) { // to actualize time limit handle_ptr->sync_stream(); + const bool lhs_refreshed = (steps % settings.parameters.lhs_refresh_period == 0); if (timer.check_time_limit() || steps >= settings.iteration_limit || context.preempt_heuristic_solver_.load()) { limit_reached = true; @@ -879,9 +1067,11 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) // every now and then, ensure external solutions are added to the population // this is done here because FJ is called within FP and also after recombiners // so FJ is one of the most inner and most frequent functions to be called - if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr) { - context.diversity_manager_ptr->get_population_pointer() - ->add_external_solutions_to_population(); + if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr && + context.diversity_manager_ptr != nullptr) { + auto* population_ptr = context.diversity_manager_ptr->get_population_pointer(); + cuopt_assert(population_ptr != nullptr, ""); + population_ptr->add_external_solutions_to_population(); } #if !FJ_SINGLE_STEP @@ -891,7 +1081,7 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) CUOPT_LOG_TRACE( "FJ " "step %d viol %.2g [%d], obj %.8g, best %.8g, mins %d, maxw %g, " - "objw %g", + "objw %g, sol %x, delta %x, inc %x, lhs %x, lhscomp %x, viol %x, weights %x", steps, data.violation_score.value(climber_stream), data.violated_constraints.set_size.value(climber_stream), @@ -899,15 +1089,26 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) data.best_objective.value(climber_stream), data.local_minimums_reached.value(climber_stream), max_cstr_weight.value(climber_stream), - objective_weight.value(climber_stream)); + objective_weight.value(climber_stream), + solution.get_hash(), + detail::compute_hash(data.jump_move_delta, climber_stream), + detail::compute_hash(data.incumbent_assignment, climber_stream), + detail::compute_hash(data.incumbent_lhs, climber_stream), + detail::compute_hash(data.incumbent_lhs_sumcomp, climber_stream), + detail::compute_hash(data.violated_constraints.contents, climber_stream), + detail::compute_hash(cstr_left_weights, climber_stream)); } - if (!limit_reached) { run_step_device(climber_stream, climber_idx); } + if (!limit_reached) { run_step_device(climber_stream, climber_idx, use_graph); } // periodically recompute the LHS and violation scores // to correct any accumulated numerical errors - if (steps % settings.parameters.lhs_refresh_period == 0) { - refresh_lhs_and_violation(climber_stream, climber_idx); + if (lhs_refreshed) { refresh_lhs_and_violation(climber_stream, climber_idx); } + if (deterministic_work_estimate && !limit_reached) { + // TODO: replace with work predictor model + double batch_work = data.deterministic_batch_work.value(climber_stream) / 1e8; + timer.record_work(batch_work); + if (timer.check_time_limit()) { limit_reached = true; } } // periodically synchronize and check the latest solution @@ -985,6 +1186,9 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) solution.get_feasible(), data.local_minimums_reached.value(climber_stream)); + // compute total time spent + double elapsed_time = timer.elapsed_time(); + CUOPT_LOG_TRACE("best fractional count %d", data.saved_best_fractional_count.value(climber_stream)); @@ -1074,7 +1278,11 @@ template i_t fj_t::solve(solution_t& solution) { raft::common::nvtx::range scope("fj_solve"); - timer_t timer(settings.time_limit); + bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + if (deterministic) { + settings.time_limit = std::max((f_t)0.0, settings.time_limit); + settings.work_limit = settings.time_limit; + } handle_ptr = const_cast(solution.handle_ptr); pb_ptr = solution.problem_ptr; last_reported_objective_ = std::numeric_limits::infinity(); @@ -1082,9 +1290,26 @@ i_t fj_t::solve(solution_t& solution) cuopt_func_call(solution.test_variable_bounds(true)); cuopt_assert(solution.test_number_all_integer(), "All integers must be rounded"); } + if (deterministic && settings.work_limit == 0.0) { + CUOPT_LOG_DEBUG("FJ: skipping solve due to exhausted deterministic work budget"); + return solution.compute_feasibility(); + } + auto total_work_start = context.gpu_heur_loop.current_work(); + auto total_time_start = std::chrono::high_resolution_clock::now(); pb_ptr->check_problem_representation(true); resize_vectors(solution.handle_ptr); + CUOPT_LOG_DEBUG( + "FJ: work_limit %f time_limit %f sol hash %x pb hash %x", + settings.work_limit < std::numeric_limits::max() ? settings.work_limit : -1.0, + settings.time_limit < std::numeric_limits::max() ? settings.time_limit : -1.0, + solution.get_hash(), + pb_ptr->get_fingerprint()); + CUOPT_LOG_DEBUG("FJ: weights hash %x, left weights hash %x, right weights hash %x", + detail::compute_hash(cstr_weights, handle_ptr->get_stream()), + detail::compute_hash(cstr_left_weights, handle_ptr->get_stream()), + detail::compute_hash(cstr_right_weights, handle_ptr->get_stream())); + bool is_initial_feasible = solution.compute_feasibility(); auto initial_solution = solution; // if we're in rounding mode, split the time/iteration limit between the first and second stage @@ -1094,6 +1319,10 @@ i_t fj_t::solve(solution_t& solution) if (settings.mode == fj_mode_t::ROUNDING) { settings.time_limit = settings.time_limit * (1 - settings.parameters.rounding_second_stage_split); + if (deterministic) { + settings.work_limit = + settings.work_limit * (1 - settings.parameters.rounding_second_stage_split); + } settings.iteration_limit = settings.iteration_limit * (1 - settings.parameters.rounding_second_stage_split); } @@ -1119,17 +1348,25 @@ i_t fj_t::solve(solution_t& solution) RAFT_CHECK_CUDA(handle_ptr->get_stream()); handle_ptr->sync_stream(); + if (deterministic) { initialize_deterministic_work_estimator(); } + i_t iterations = host_loop(solution); RAFT_CHECK_CUDA(handle_ptr->get_stream()); handle_ptr->sync_stream(); - f_t effort_rate = (f_t)iterations / timer.elapsed_time(); + f_t elapsed_time = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - total_time_start) + .count(); + f_t effort_rate = (f_t)iterations / elapsed_time; // If we're in rounding mode and some fractionals remain: round them all // limit = total_limit * second_stage_split if (settings.mode == fj_mode_t::ROUNDING && climbers[0]->fractional_variables.set_size.value(handle_ptr->get_stream()) > 0) { settings.time_limit = settings.time_limit * settings.parameters.rounding_second_stage_split; + if (deterministic) { + settings.work_limit = settings.work_limit * settings.parameters.rounding_second_stage_split; + } settings.iteration_limit = settings.iteration_limit * settings.parameters.rounding_second_stage_split; @@ -1141,7 +1378,7 @@ i_t fj_t::solve(solution_t& solution) } } - CUOPT_LOG_TRACE("GPU solver took %g", timer.elapsed_time()); + CUOPT_LOG_TRACE("GPU solver took %g", elapsed_time); CUOPT_LOG_TRACE("limit reached, effort rate %g steps/secm %d steps", effort_rate, iterations); reset_cuda_graph(); i_t n_integer_vars = thrust::count_if( @@ -1166,6 +1403,18 @@ i_t fj_t::solve(solution_t& solution) cuopt_assert(solution.compute_feasibility(), "Reverted solution should be feasible"); } + cuopt_func_call(solution.test_variable_bounds()); + + if (deterministic) { + auto total_work_end = context.gpu_heur_loop.current_work(); + CUOPT_LOG_DEBUG("FJ: worked %fwu for %d iterations, %g seconds", + total_work_end - total_work_start, + iterations, + elapsed_time); + } + + CUOPT_LOG_DEBUG("FJ sol hash %x", solution.get_hash()); + return is_new_feasible; } diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh index 50b451a86e..a68ba1c467 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh @@ -19,6 +19,9 @@ #include +#include +#include + #include #define FJ_DEBUG_LOAD_BALANCING 0 @@ -105,6 +108,7 @@ struct fj_settings_t { fj_mode_t mode{fj_mode_t::FIRST_FEASIBLE}; fj_candidate_selection_t candidate_selection{fj_candidate_selection_t::WEIGHTED_SCORE}; double time_limit{60.0}; + double work_limit{std::numeric_limits::infinity()}; int iteration_limit{std::numeric_limits::max()}; fj_hyper_parameters_t parameters{}; int n_of_minimums_for_exit = 7000; @@ -131,12 +135,17 @@ struct fj_move_t { bool operator!=(const fj_move_t& rhs) const { return !(*this == rhs); } }; -// TODO: use 32bit integers instead, -// as we dont need them to be floating point per the FJ2 scoring scheme // sizeof(fj_staged_score_t) <= 8 is needed to allow for atomic loads struct fj_staged_score_t { - float base{-std::numeric_limits::infinity()}; - float bonus{-std::numeric_limits::infinity()}; + int32_t base{std::numeric_limits::lowest()}; + int32_t bonus{std::numeric_limits::lowest()}; + + fj_staged_score_t() = default; + HDI fj_staged_score_t(int32_t base_, int32_t bonus_) : base(base_), bonus(bonus_) {} + fj_staged_score_t(const fj_staged_score_t&) = default; + fj_staged_score_t(fj_staged_score_t&&) = default; + fj_staged_score_t& operator=(const fj_staged_score_t&) = default; + fj_staged_score_t& operator=(fj_staged_score_t&&) = default; HDI bool operator<(fj_staged_score_t other) const noexcept { @@ -154,7 +163,7 @@ struct fj_staged_score_t { HDI static fj_staged_score_t invalid() { - return {-std::numeric_limits::infinity(), -std::numeric_limits::infinity()}; + return {std::numeric_limits::lowest(), std::numeric_limits::lowest()}; } HDI static fj_staged_score_t zero() { return {0, 0}; } @@ -268,6 +277,7 @@ class fj_t { rmm::device_uvector work_id_to_bin_var_idx; rmm::device_uvector work_id_to_nonbin_var_idx; rmm::device_uvector work_ids_for_related_vars; + rmm::device_uvector deterministic_frontier_work_by_var_d_; cudaGraphExec_t graph_instance; bool graph_created = false; @@ -326,6 +336,7 @@ class fj_t { rmm::device_scalar full_refresh_iteration; rmm::device_scalar relvar_count_last_update; rmm::device_scalar load_balancing_skip; + rmm::device_scalar deterministic_batch_work; contiguous_set_t violated_constraints; contiguous_set_t candidate_variables; @@ -420,6 +431,7 @@ class fj_t { last_iter_candidates(0, fj.handle_ptr->get_stream()), relvar_count_last_update(0, fj.handle_ptr->get_stream()), load_balancing_skip(0, fj.handle_ptr->get_stream()), + deterministic_batch_work(0.0, fj.handle_ptr->get_stream()), break_condition(0, fj.handle_ptr->get_stream()), temp_break_condition(0, fj.handle_ptr->get_stream()), cub_storage_bytes(0, fj.handle_ptr->get_stream()), @@ -490,6 +502,7 @@ class fj_t { raft::device_span row_size_nonbin_prefix_sum; raft::device_span work_id_to_bin_var_idx; raft::device_span work_id_to_nonbin_var_idx; + raft::device_span deterministic_frontier_work_by_var; i_t* selected_var; i_t* constraints_changed_count; @@ -518,6 +531,9 @@ class fj_t { i_t* relvar_count_last_update; i_t* load_balancing_skip; f_t* max_cstr_weight; + double* deterministic_batch_work; + double deterministic_refresh_work; + bool deterministic_work_accounting; fj_settings_t* settings; @@ -634,6 +650,19 @@ class fj_t { std::vector> climbers; rmm::device_uvector climber_views; fj_settings_t settings; + std::vector deterministic_frontier_work_by_var_; + double deterministic_average_frontier_work_{0.0}; + double deterministic_refresh_work_{0.0}; + + public: + void initialize_deterministic_work_estimator(); + void set_improvement_callback(fj_improvement_callback_t callback) + { + improvement_callback = std::move(callback); + } + + private: + bool use_load_balancing_codepath() const; fj_improvement_callback_t improvement_callback; f_t last_reported_objective_{std::numeric_limits::infinity()}; diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh index e57f0ec9e2..ec9b592550 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh @@ -103,7 +103,9 @@ HDI std::pair feas_score_constraint( f_t cstr_coeff, f_t c_lb, f_t c_ub, - f_t current_lhs) + f_t current_lhs, + f_t cstr_left_weight, + f_t cstr_right_weight) { cuopt_assert(isfinite(delta), "invalid delta"); cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient"); @@ -123,14 +125,13 @@ HDI std::pair feas_score_constraint( // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum) // likely unreasonable and overkill however - f_t cstr_weight = - bound_idx == 0 ? fj.cstr_left_weights[cstr_idx] : fj.cstr_right_weights[cstr_idx]; - f_t sign = bound_idx == 0 ? -1 : 1; - f_t rhs = bounds[bound_idx] * sign; - f_t old_lhs = current_lhs * sign; - f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; - f_t old_slack = rhs - old_lhs; - f_t new_slack = rhs - new_lhs; + f_t cstr_weight = bound_idx == 0 ? cstr_left_weight : cstr_right_weight; + f_t sign = bound_idx == 0 ? -1 : 1; + f_t rhs = bounds[bound_idx] * sign; + f_t old_lhs = current_lhs * sign; + f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; + f_t old_slack = rhs - old_lhs; + f_t new_slack = rhs - new_lhs; cuopt_assert(isfinite(cstr_weight), "invalid weight"); cuopt_assert(cstr_weight >= 0, "invalid weight"); diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu index ebbb761277..90f26ac4a5 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu @@ -14,6 +14,11 @@ #include +#include +#include + +#include + #include #include "feasibility_jump_impl_common.cuh" @@ -25,6 +30,39 @@ namespace cg = cooperative_groups; namespace cuopt::linear_programming::detail { +template +DI void charge_deterministic_iteration_work(typename fj_t::climber_data_t::view_t fj, + bool full_score_refresh) +{ + if (!fj.deterministic_work_accounting || !FIRST_THREAD) { return; } + + const i_t selected_var = *fj.selected_var; + + double work = fj.deterministic_refresh_work; + if (!full_score_refresh && selected_var >= 0 && + selected_var < static_cast(fj.deterministic_frontier_work_by_var.size())) { + work = fj.deterministic_frontier_work_by_var[selected_var]; + } + + *fj.deterministic_batch_work += work; +} + +template +struct score_with_tiebreaker_comparator { + DI auto operator()(const thrust::pair& a, + const thrust::pair& b) const + { + auto a_score = a.first; + auto a_idx = a.second; + auto b_score = b.first; + auto b_idx = b.second; + + if (a_score > b_score) return a; + if (a_score == b_score && a_idx > b_idx) return a; + return b; + } +}; + template DI thrust::pair move_objective_score( const typename fj_t::climber_data_t::view_t& fj, i_t var_idx, f_t delta) @@ -139,7 +177,8 @@ DI void update_weights(typename fj_t::climber_data_t::view_t& fj) } template -__global__ void init_lhs_and_violation(typename fj_t::climber_data_t::view_t fj) +__global__ void init_lhs_and_violated_constraints( + typename fj_t::climber_data_t::view_t fj) { for (i_t cstr_idx = TH_ID_X; cstr_idx < fj.pb.n_constraints; cstr_idx += GRID_STRIDE) { auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx); @@ -152,10 +191,7 @@ __global__ void init_lhs_and_violation(typename fj_t::climber_data_t:: fj_kahan_babushka_neumaier_sum(delta_it + offset_begin, delta_it + offset_end); fj.incumbent_lhs_sumcomp[cstr_idx] = 0; - f_t th_violation = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]); - f_t weighted_violation = th_violation * fj.cstr_weights[cstr_idx]; - atomicAdd(fj.violation_score, th_violation); - atomicAdd(fj.weighted_violation_score, weighted_violation); + f_t th_violation = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]); f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx); if (th_violation < -cstr_tolerance) { fj.violated_constraints.insert(cstr_idx); } } @@ -191,8 +227,17 @@ DI typename fj_t::move_score_info_t compute_new_score( f_t c_lb = fj.pb.constraint_lower_bounds[cstr_idx]; f_t c_ub = fj.pb.constraint_upper_bounds[cstr_idx]; - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -349,7 +394,7 @@ DI std::pair::move_score_info_t> compute_best_mtm( return std::make_pair(best_val, best_score_info); } -template +template DI void update_jump_value(typename fj_t::climber_data_t::view_t fj, i_t var_idx) { cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, "invalid variable index"); @@ -376,12 +421,11 @@ DI void update_jump_value(typename fj_t::climber_data_t::view_t fj, i_ fj.pb.check_variable_within_bounds(var_idx, fj.incumbent_assignment[var_idx] + delta), "Var not within bounds!"); } - best_score_info = compute_new_score(fj, var_idx, delta); + best_score_info = compute_new_score(fj, var_idx, delta); } else { - auto [best_val, score_info] = - compute_best_mtm(fj, var_idx); - delta = best_val - fj.incumbent_assignment[var_idx]; - best_score_info = score_info; + auto [best_val, score_info] = compute_best_mtm(fj, var_idx); + delta = best_val - fj.incumbent_assignment[var_idx]; + best_score_info = score_info; } } else { delta = round(1.0 - 2 * fj.incumbent_assignment[var_idx]); @@ -577,14 +621,16 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t __syncthreads(); - cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite"); - // Kahan compensated summation - // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx]; - f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx]; - f_t t = old_lhs + y; - fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y; - fj.incumbent_lhs[cstr_idx] = t; - cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite"); + if (threadIdx.x == 0) { + cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite"); + // Kahan compensated summation + // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx]; + f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx]; + f_t t = old_lhs + y; + fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y; + fj.incumbent_lhs[cstr_idx] = t; + cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite"); + } } // update the assignment and objective proper @@ -626,8 +672,8 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t #if FJ_SINGLE_STEP DEVICE_LOG_DEBUG( - "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g=%.2g, " - "err_range %.2g%%, infeas %.2g, total viol %d\n", + "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g=%.2g, " + "err_range %.2g%%, infeas %.2g, total viol %d, obj %x, delta %x, coef %x\n", *fj.iterations, var_idx, get_lower(fj.pb.variable_bounds[var_idx]), @@ -642,7 +688,10 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t *fj.incumbent_objective + fj.jump_move_delta[var_idx] * fj.pb.objective_coefficients[var_idx], delta_rel_err, fj.jump_move_infeasibility[var_idx], - fj.violated_constraints.size()); + fj.violated_constraints.size(), + detail::compute_hash(*fj.incumbent_objective), + detail::compute_hash(fj.jump_move_delta[var_idx]), + detail::compute_hash(fj.pb.objective_coefficients[var_idx])); #endif // reset the score fj.jump_move_scores[var_idx] = fj_t::move_score_t::invalid(); @@ -862,6 +911,16 @@ DI void update_changed_constraints(typename fj_t::climber_data_t::view if (blockIdx.x == 0) { if (threadIdx.x == 0) { + // sort changed constraints to guarantee determinism + // TODO: usually csontraint changed few, but thats still rather dreadful... + // block-parallelize at least? but not trivial for arbitrary sizes w/ CUB + // TODO: replace once focus shifts to tuning deterministic GPU heuristics + if (fj.deterministic_work_accounting) { + thrust::sort(thrust::seq, + fj.constraints_changed.begin(), + fj.constraints_changed.begin() + *fj.constraints_changed_count); + } + for (i_t i = 0; i < *fj.constraints_changed_count; ++i) { i_t idx = fj.constraints_changed[i]; if ((idx & 1) == CONSTRAINT_FLAG_INSERT) { @@ -953,7 +1012,7 @@ __global__ void compute_iteration_related_variables_kernel( compute_iteration_related_variables(fj); } -template +template __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_t fj, bool ForceRefresh) { @@ -965,11 +1024,14 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ if (*fj.selected_var == std::numeric_limits::max()) full_refresh = true; // always do a full sweep when looking for satisfied mtm moves - if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) full_refresh = true; - - // only update related variables i_t split_begin, split_end; - if (full_refresh) { + if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) { + full_refresh = true; + split_begin = 0; + split_end = fj.objective_vars.size(); + } + // only update related variables + else if (full_refresh) { split_begin = 0; split_end = fj.pb.n_variables; } @@ -989,12 +1051,20 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ split_end = range.second; } + charge_deterministic_iteration_work(fj, full_refresh); + if (FIRST_THREAD) *fj.relvar_count_last_update = split_end - split_begin; for (i_t i = blockIdx.x + split_begin; i < split_end; i += gridDim.x) { - i_t var_idx = full_refresh ? i - : fj.pb.related_variables.size() == 0 ? i - : fj.pb.related_variables[i]; + // if sat MTM mode, go over objective variables only + i_t var_idx; + if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) { + var_idx = fj.objective_vars[i]; + } else { + var_idx = full_refresh ? i + : fj.pb.related_variables.size() == 0 ? i + : fj.pb.related_variables[i]; + } // skip if we couldnt precompute a related var table and // this variable isnt in the dynamic related variable table @@ -1017,7 +1087,7 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ } cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, ""); - update_jump_value(fj, var_idx); + update_jump_value(fj, var_idx); } } @@ -1025,7 +1095,7 @@ template __global__ void compute_mtm_moves_kernel(typename fj_t::climber_data_t::view_t fj, bool ForceRefresh) { - compute_mtm_moves(fj, ForceRefresh); + compute_mtm_moves(fj, ForceRefresh); } template @@ -1037,8 +1107,9 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: fj.settings->seed, *fj.iterations * fj.settings->parameters.max_sampled_moves, 0); using move_score_t = typename fj_t::move_score_t; - __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)]; - auto* const shmem = (move_score_t*)shmem_storage; + __shared__ alignas(thrust::pair) char + shmem_storage[raft::WarpSize * sizeof(thrust::pair)]; + auto* const shmem = (thrust::pair*)shmem_storage; auto th_best_score = fj_t::move_score_t::invalid(); i_t th_selected_var = std::numeric_limits::max(); @@ -1075,8 +1146,11 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: } } // Block level reduction to get the best variable from the sample + // Use deterministic tie-breaking comparator based on var_idx auto [best_score, reduced_selected_var] = - raft::blockRankedReduce(th_best_score, shmem, th_selected_var, raft::max_op{}); + raft::blockReduce(thrust::make_pair(th_best_score, th_selected_var), + (char*)shmem, + score_with_tiebreaker_comparator{}); if (FIRST_THREAD) { // assign it to print the value outside th_best_score = best_score; @@ -1111,9 +1185,9 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: i_t var_range = get_upper(bounds) - get_lower(bounds); double delta_rel_err = fabs(fj.jump_move_delta[selected_var]) / var_range * 100; DEVICE_LOG_INFO( - "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g->%.2g, " + "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g->%.2g, " "delta_rel_err %.2g%%, " - "infeas %.2g, total viol %d, out of %d\n", + "infeas %.2g, total viol %d, out of %d, obj %x\n", selected_var, get_lower(bounds), get_upper(bounds), @@ -1130,9 +1204,18 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: delta_rel_err, fj.jump_move_infeasibility[selected_var], fj.violated_constraints.size(), - good_var_count); + good_var_count, + detail::compute_hash(*fj.incumbent_objective)); #endif cuopt_assert(fj.jump_move_scores[selected_var].valid(), ""); + } else { +#if FJ_SINGLE_STEP + DEVICE_LOG_INFO("=[%d]---- FJ: no var selected, obj is %g, viol %d, out of %d\n", + *fj.iterations, + *fj.incumbent_objective, + fj.violated_constraints.size(), + good_var_count); +#endif } } } @@ -1202,27 +1285,32 @@ DI thrust::tuple::move_score_t> gridwide_reduc if (blockIdx.x == 0) { using move_score_t = typename fj_t::move_score_t; - __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)]; - auto* const shmem = (move_score_t*)shmem_storage; + __shared__ alignas(thrust::pair) char + shmem_storage[2 * raft::WarpSize * sizeof(thrust::pair)]; + auto* const shmem = (thrust::pair*)shmem_storage; auto th_best_score = fj_t::move_score_t::invalid(); i_t th_best_block = 0; + i_t th_best_var = -1; for (i_t i = threadIdx.x; i < gridDim.x; i += blockDim.x) { auto var_idx = fj.grid_var_buf[i]; auto move_score = fj.grid_score_buf[i]; - if (move_score > th_best_score || - (move_score == th_best_score && var_idx > fj.grid_var_buf[th_best_block])) { + if (move_score > th_best_score || (move_score == th_best_score && var_idx > th_best_var)) { th_best_score = move_score; th_best_block = i; + th_best_var = var_idx; } } // Block level reduction to get the best variable from all blocks - auto [reduced_best_score, reduced_best_block] = - raft::blockRankedReduce(th_best_score, shmem, th_best_block, raft::max_op{}); - - if (reduced_best_score.valid() && threadIdx.x == 0) { - cuopt_assert(th_best_block < gridDim.x, ""); + auto [reduced_best_score_pair, reduced_best_block] = + raft::blockRankedReduce(thrust::make_pair(th_best_score, th_best_var), + shmem, + th_best_block, + score_with_tiebreaker_comparator{}); + + if (reduced_best_score_pair.first.valid() && threadIdx.x == 0) { + cuopt_assert(reduced_best_block < gridDim.x, ""); best_var = fj.grid_var_buf[reduced_best_block]; best_delta = fj.grid_delta_buf[reduced_best_block]; best_score = fj.grid_score_buf[reduced_best_block]; @@ -1244,6 +1332,9 @@ DI thrust::tuple::move_score_t> best_random_mt raft::random::PCGenerator rng(fj.settings->seed + *fj.iterations, 0, 0); i_t cstr_idx = fj.violated_constraints.contents[rng.next_u32() % fj.violated_constraints.size()]; + cuopt_assert(fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]) < 0, + "constraint isn't violated"); + auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx); return gridwide_reduce_best_move( @@ -1258,7 +1349,9 @@ DI thrust::tuple::move_score_t> best_sat_cstr_ typename fj_t::climber_data_t::view_t fj) { // compute all MTM moves within satisfied constraints - compute_mtm_moves(fj, true); + compute_mtm_moves(fj, true); + // NOTE: grid sync not required since each block only reduces over variables that it updated in + // compute_mtm_moves return gridwide_reduce_best_move( fj, fj.objective_vars.begin(), fj.objective_vars.end(), [fj] __device__(i_t var_idx) { return fj.jump_move_delta[var_idx]; @@ -1413,9 +1506,10 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat if (sat_best_score.base > 0 && sat_best_score > best_score) { if (FIRST_THREAD) { - best_score = sat_best_score; - best_var = sat_best_var; - best_delta = sat_best_delta; + best_score = sat_best_score; + best_var = sat_best_var; + best_delta = sat_best_delta; + best_movetype = 'S'; } } } @@ -1427,6 +1521,15 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat best_var, fj.incumbent_assignment[best_var] + best_delta), "assignment not within bounds"); fj.jump_move_delta[best_var] = best_delta; +#if FJ_SINGLE_STEP + DEVICE_LOG_DEBUG("FJ[%d] selected_var: %d, delta %g, score {%d %d}, type %c\n", + *fj.iterations, + best_var, + best_delta, + best_score.base, + best_score.bonus, + best_movetype); +#endif } } } @@ -1458,7 +1561,7 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat const __grid_constant__ typename fj_t::climber_data_t::view_t fj); \ template __global__ void load_balancing_sanity_checks( \ const __grid_constant__ typename fj_t::climber_data_t::view_t fj); \ - template __global__ void init_lhs_and_violation( \ + template __global__ void init_lhs_and_violated_constraints( \ typename fj_t::climber_data_t::view_t fj); \ template __global__ void update_lift_moves_kernel( \ typename fj_t::climber_data_t::view_t fj); \ diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh index 55fd4e61f1..9b99cdeb21 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh @@ -52,7 +52,8 @@ __global__ void load_balancing_mtm_compute_scores( const __grid_constant__ typename fj_t::climber_data_t::view_t fj); template -__global__ void init_lhs_and_violation(typename fj_t::climber_data_t::view_t fj); +__global__ void init_lhs_and_violated_constraints( + typename fj_t::climber_data_t::view_t fj); // Update the jump move tables after the best jump value has been computed for a "heavy" variable template diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index b16f299bf1..34634959c8 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -11,16 +11,20 @@ #include "feasibility_jump_impl_common.cuh" #include "fj_cpu.cuh" +#include #include #include +#include +#include + +#include #include -#include -#include +#include +#include #include #include -#include #include #include @@ -38,6 +42,24 @@ namespace cuopt::linear_programming::detail { +namespace { + +double read_positive_work_unit_scale(const char* env_name) +{ + const char* env_value = std::getenv(env_name); + if (env_value == nullptr || env_value[0] == '\0') { return 1.0; } + + errno = 0; + char* end_ptr = nullptr; + const double parsed_value = std::strtod(env_value, &end_ptr); + const bool valid_value = errno == 0 && end_ptr != env_value && *end_ptr == '\0' && + std::isfinite(parsed_value) && parsed_value > 0.0; + cuopt_assert(valid_value, "Invalid CPUFJ work-unit scale env var"); + return parsed_value; +} + +} // namespace + template thrust::tuple get_mtm_for_bound(const typename fj_t::climber_data_t::view_t& fj, i_t var_idx, @@ -107,99 +129,6 @@ thrust::tuple get_mtm_for_constraint( return {delta_ij, sign, slack, cstr_tolerance}; } -template -std::pair feas_score_constraint(const typename fj_t::climber_data_t::view_t& fj, - i_t var_idx, - f_t delta, - i_t cstr_idx, - f_t cstr_coeff, - f_t c_lb, - f_t c_ub, - f_t current_lhs, - f_t left_weight, - f_t right_weight) -{ - cuopt_assert(isfinite(delta), "invalid delta"); - cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient"); - - f_t base_feas = 0; - f_t bonus_robust = 0; - - f_t bounds[2] = {c_lb, c_ub}; - cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range"); - for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) { - if (!isfinite(bounds[bound_idx])) continue; - - // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into - // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match - // the convention of the paper - - // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums - // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum) - // likely unreasonable and overkill however - f_t cstr_weight = bound_idx == 0 ? left_weight : right_weight; - f_t sign = bound_idx == 0 ? -1 : 1; - f_t rhs = bounds[bound_idx] * sign; - f_t old_lhs = current_lhs * sign; - f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; - f_t old_slack = rhs - old_lhs; - f_t new_slack = rhs - new_lhs; - - cuopt_assert(isfinite(cstr_weight), "invalid weight"); - cuopt_assert(cstr_weight >= 0, "invalid weight"); - cuopt_assert(isfinite(old_lhs), ""); - cuopt_assert(isfinite(new_lhs), ""); - cuopt_assert(isfinite(old_slack) && isfinite(new_slack), ""); - - f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx, c_lb, c_ub); - - bool old_viol = fj.excess_score(cstr_idx, current_lhs, c_lb, c_ub) < -cstr_tolerance; - bool new_viol = - fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta, c_lb, c_ub) < -cstr_tolerance; - - bool old_sat = old_lhs < rhs + cstr_tolerance; - bool new_sat = new_lhs < rhs + cstr_tolerance; - - // equality - if (fj.pb.integer_equal(c_lb, c_ub)) { - if (!old_viol) cuopt_assert(old_sat == !old_viol, ""); - if (!new_viol) cuopt_assert(new_sat == !new_viol, ""); - } - - // if it would feasibilize this constraint - if (!old_sat && new_sat) { - cuopt_assert(old_viol, ""); - base_feas += cstr_weight; - } - // would cause this constraint to be violated - else if (old_sat && !new_sat) { - cuopt_assert(new_viol, ""); - base_feas -= cstr_weight; - } - // simple improvement - else if (!old_sat && !new_sat && old_lhs > new_lhs) { - cuopt_assert(old_viol && new_viol, ""); - base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); - } - // simple worsening - else if (!old_sat && !new_sat && old_lhs <= new_lhs) { - cuopt_assert(old_viol && new_viol, ""); - base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); - } - - // robustness score bonus if this would leave some strick slack - bool old_stable = old_lhs < rhs - cstr_tolerance; - bool new_stable = new_lhs < rhs - cstr_tolerance; - if (!old_stable && new_stable) { - bonus_robust += cstr_weight; - } else if (old_stable && !new_stable) { - bonus_robust -= cstr_weight; - } - } - - return {base_feas, bonus_robust}; -} - static constexpr double BIGVAL_THRESHOLD = 1e20; template @@ -1401,6 +1330,15 @@ std::unique_ptr> fj_t::create_cpu_climber( // Initialize fj_cpu with all the data init_fj_cpu(*fj_cpu, solution, left_weights, right_weights, objective_weight); + const double cpu_work_unit_scale = + context.settings.cpufj_work_unit_scale != 1.0 + ? context.settings.cpufj_work_unit_scale + : read_positive_work_unit_scale("CUOPT_CPUFJ_WORK_UNIT_SCALE"); + fj_cpu->work_unit_bias *= cpu_work_unit_scale; + if (cpu_work_unit_scale != 1.0) { + CUOPT_DETERMINISM_LOG( + "CPUFJ using work-unit scale %f (bias=%f)", cpu_work_unit_scale, fj_cpu->work_unit_bias); + } fj_cpu->settings = settings; if (randomize_params) { auto rng = std::mt19937(cuopt::seed_generator::get_seed()); @@ -1550,6 +1488,10 @@ static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_lim fj_cpu.work_units_elapsed += biased_work; if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); } + + if (fj_cpu.work_units_elapsed.load(std::memory_order_relaxed) >= fj_cpu.work_budget) { + break; + } } cuopt_func_call(sanity_checks(fj_cpu)); diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh index 3263609a2b..4124bd079a 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh @@ -154,7 +154,8 @@ struct fj_cpu_climber_t { // Work unit tracking for deterministic synchronization std::atomic work_units_elapsed{0.0}; - double work_unit_bias{1.5}; // Bias factor to keep CPUFJ ahead of B&B + double work_unit_bias{1.5}; // Bias factor to keep CPUFJ ahead of B&B + double work_budget{std::numeric_limits::infinity()}; producer_sync_t* producer_sync{nullptr}; // Optional sync utility for notifying progress std::atomic halted{false}; diff --git a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh index dfc9b3c885..8b77367ac4 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -120,16 +120,19 @@ __global__ void load_balancing_prepare_iteration(const __grid_constant__ typename fj_t::climber_data_t::view_t fj) { bool full_refresh = needs_full_refresh(fj); + charge_deterministic_iteration_work(fj, full_refresh); // alternate codepath in the case of a small related_var/total_var ratio if (!full_refresh && fj.pb.related_variables.size() > 0 && fj.pb.n_variables / fj.work_ids_for_related_vars[*fj.selected_var] >= - fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold) { + fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold && + fj.settings->load_balancing_mode != fj_load_balancing_mode_t::ALWAYS_ON) { auto range = fj.pb.range_for_related_vars(*fj.selected_var); for (i_t i = blockIdx.x + range.first; i < range.second; i += gridDim.x) { i_t var_idx = fj.pb.related_variables[i]; - update_jump_value(fj, var_idx); + update_jump_value(fj, + var_idx); } if (FIRST_THREAD) *fj.load_balancing_skip = true; @@ -334,8 +337,17 @@ __global__ void load_balancing_compute_scores_binary( auto c_lb = fj.constraint_lower_bounds_csr[csr_offset]; auto c_ub = fj.constraint_upper_bounds_csr[csr_offset]; - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -526,8 +538,8 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ auto& score_info = candidate.score; - f_t base_feas = 0; - f_t bonus_robust = 0; + int32_t base_feas = 0; + int32_t bonus_robust = 0; // same as for the binary var kernel, compute each score compoenent per thread // and merge then via a wapr reduce @@ -535,8 +547,17 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ cuopt_assert(c_lb == fj.pb.constraint_lower_bounds[cstr_idx], "bound sanity check failed"); cuopt_assert(c_ub == fj.pb.constraint_upper_bounds[cstr_idx], "bound sanity check failed"); - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -565,24 +586,29 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ best_score_ref{fj.jump_move_scores[var_idx]}; auto best_score = best_score_ref.load(cuda::memory_order_relaxed); + cuda::atomic_ref best_delta_ref{ + fj.jump_move_delta[var_idx]}; + auto best_delta = best_delta_ref.load(cuda::memory_order_relaxed); + if (best_score < candidate.score || - (best_score == candidate.score && candidate.delta < fj.jump_move_delta[var_idx])) { + (best_score == candidate.score && candidate.delta < best_delta)) { // update the best move delta acquire_lock(&fj.jump_locks[var_idx]); // reject this move if it would increase the target variable to a numerically unstable // value - if (!fj.move_numerically_stable(fj.incumbent_assignment[var_idx], - fj.incumbent_assignment[var_idx] + delta, - base_feas, - *fj.violation_score)) { - fj.jump_move_scores[var_idx] = fj_t::move_score_t::invalid(); - } else if (fj.jump_move_scores[var_idx] < candidate.score - // determinism for ease of debugging - || (fj.jump_move_scores[var_idx] == candidate.score && - candidate.delta < fj.jump_move_delta[var_idx])) { - fj.jump_move_delta[var_idx] = candidate.delta; - fj.jump_move_scores[var_idx] = candidate.score; + // only skip updating, don't invalidate existing valid moves + if (fj.move_numerically_stable(fj.incumbent_assignment[var_idx], + fj.incumbent_assignment[var_idx] + delta, + base_feas, + *fj.violation_score)) { + if (fj.jump_move_scores[var_idx] < candidate.score + // determinism for ease of debugging + || (fj.jump_move_scores[var_idx] == candidate.score && + candidate.delta < fj.jump_move_delta[var_idx])) { + fj.jump_move_delta[var_idx] = candidate.delta; + fj.jump_move_scores[var_idx] = candidate.score; + } } release_lock(&fj.jump_locks[var_idx]); } @@ -644,7 +670,7 @@ __global__ void load_balancing_sanity_checks(const __grid_constant__ if (!(score_1 == score_1.invalid() && score_2 == score_2.invalid()) && !(v.pb.integer_equal(score_1.base, score_2.base) && v.pb.integer_equal(score_1.bonus, score_2.bonus))) { - printf("(iter %d) [%d, int:%d]: delta %g/%g was %f/%f, is %f/%f\n", + printf("(iter %d) [%d, int:%d]: delta %g/%g was %d/%d, is %d/%d\n", *v.iterations, var_idx, v.pb.is_integer_var(var_idx), diff --git a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh index d98686bcc6..a16567b092 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -10,6 +10,7 @@ #include "feasibility_jump.cuh" #include +#include #include #include #include @@ -133,6 +134,23 @@ struct contiguous_set_t { validity_bitmap.resize(size, stream); } + void sort(const rmm::cuda_stream_view& stream) + { + thrust::sort( + rmm::exec_policy(stream), contents.begin(), contents.begin() + set_size.value(stream)); + thrust::fill(rmm::exec_policy(stream), index_map.begin(), index_map.end(), -1); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(set_size.value(stream)), + [v = view()] __device__(i_t idx) { v.index_map[v.contents[idx]] = idx; }); + + // only useful for debugging and ensuring the same hashes are printed +#if FJ_SINGLE_STEP + thrust::fill( + rmm::exec_policy(stream), contents.begin() + set_size.value(stream), contents.end(), 0); +#endif + } + struct view_t { i_t* set_size; i_t* lock; diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu index 0a17e3ebfd..6818c87ad4 100644 --- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu +++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu @@ -29,6 +29,14 @@ #include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif namespace cuopt::linear_programming::detail { template @@ -52,7 +60,7 @@ feasibility_pump_t::feasibility_pump_t( context.problem_ptr->handle_ptr->get_stream()), lp_optimal_solution(lp_optimal_solution_), rng(cuopt::seed_generator::get_seed()), - timer(20.) + timer(20., *context.termination) { } @@ -147,18 +155,36 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t temp_p(*solution.problem_ptr); auto h_integer_indices = cuopt::host_copy(solution.problem_ptr->integer_indices, solution.handle_ptr->get_stream()); + cuopt_assert(h_assignment.size() == solution.problem_ptr->n_variables, "Size mismatch"); + cuopt_assert(h_last_projection.size() == solution.problem_ptr->n_variables, "Size mismatch"); + cuopt_assert(h_variable_bounds.size() == solution.problem_ptr->n_variables, "Size mismatch"); + CUOPT_DETERMINISM_LOG( + "FP proj inputs: assign_hash=0x%x last_proj_hash=0x%x integer_idx_hash=0x%x n_vars=%d n_int=%d", + detail::compute_hash(h_assignment), + detail::compute_hash(h_last_projection), + detail::compute_hash(h_integer_indices), + solution.problem_ptr->n_variables, + solution.problem_ptr->n_integer_vars); f_t obj_offset = 0; + i_t n_at_upper = 0; + i_t n_at_lower = 0; + i_t n_interior = 0; + std::vector interior_integer_indices; + interior_integer_indices.reserve(h_integer_indices.size()); // for each integer add the variable and the distance constraints for (auto i : h_integer_indices) { + cuopt_assert(i >= 0 && i < solution.problem_ptr->n_variables, "Index out of bounds"); auto h_var_bounds = h_variable_bounds[i]; if (solution.problem_ptr->integer_equal(h_assignment[i], get_upper(h_var_bounds))) { obj_offset += get_upper(h_var_bounds); // set the objective weight to -1, u - x obj_coefficients[i] = -1; + n_at_upper++; } else if (solution.problem_ptr->integer_equal(h_assignment[i], get_lower(h_var_bounds))) { obj_offset -= get_lower(h_var_bounds); // set the objective weight to +1, x - l obj_coefficients[i] = 1; + n_at_lower++; } else { // objective weight is 1 const f_t obj_weight = 1.; @@ -183,9 +209,30 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t constr_coeffs_2{1, 1}; h_constraints.add_constraint( constr_indices, constr_coeffs_2, h_assignment[i], (f_t)default_cont_upper); + n_interior++; + interior_integer_indices.push_back(i); } } + CUOPT_DETERMINISM_LOG( + "FP proj build: at_lower=%d at_upper=%d interior=%d interior_idx_hash=0x%x obj_hash=0x%x " + "assign_aug_hash=0x%x vars_added=%d cstr_added=%d cstr_var_hash=0x%x cstr_coeff_hash=0x%x " + "cstr_offset_hash=0x%x cstr_lb_hash=0x%x cstr_ub_hash=0x%x", + n_at_lower, + n_at_upper, + n_interior, + detail::compute_hash(interior_integer_indices), + detail::compute_hash(obj_coefficients), + detail::compute_hash(h_assignment), + h_variables.size(), + h_constraints.n_constraints(), + detail::compute_hash(h_constraints.constraint_variables), + detail::compute_hash(h_constraints.constraint_coefficients), + detail::compute_hash(h_constraints.constraint_offsets), + detail::compute_hash(h_constraints.constraint_lower_bounds), + detail::compute_hash(h_constraints.constraint_upper_bounds)); adjust_objective_with_original(solution, obj_coefficients, longer_lp_run); + CUOPT_DETERMINISM_LOG("FP proj adjusted objective hash=0x%x", + detail::compute_hash(obj_coefficients)); // commit all the changes that were done by the host if (h_variables.size() > 0) { temp_p.insert_variables(h_variables); } if (h_constraints.n_constraints() > 0) { temp_p.insert_constraints(h_constraints); } @@ -196,6 +243,12 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_tget_stream()), + temp_p.n_variables, + temp_p.n_constraints); // copy new objective coefficients raft::copy(temp_p.objective_coefficients.data(), obj_coefficients.data(), @@ -210,13 +263,22 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t::round(solution_t& solution) { bool result; CUOPT_LOG_DEBUG("Rounding the point"); - timer_t bounds_prop_timer(std::max(0.05, std::min(0.5, timer.remaining_time() / 10.))); + const int64_t seed_before = cuopt::seed_generator::peek_seed(); + const uint32_t hash_before = solution.get_hash(); + CUOPT_DETERMINISM_LOG("FP round entry: hash=0x%x seed=%lld rem=%.6f", + hash_before, + (long long)seed_before, + timer.remaining_time()); + + f_t bounds_prop_time_limit = std::min((f_t)0.5, timer.remaining_time() / 10.); + if (timer.deterministic) { + bounds_prop_time_limit = std::max((f_t)0.0, bounds_prop_time_limit); + } else { + bounds_prop_time_limit = std::max((f_t)0.05, bounds_prop_time_limit); + } + termination_checker_t bounds_prop_timer( + context.gpu_heur_loop, bounds_prop_time_limit, *context.termination); const f_t lp_run_time_after_feasible = 0.; bool old_var = constraint_prop.round_all_vars; f_t old_time = constraint_prop.max_time_for_bounds_prop; @@ -264,6 +340,15 @@ bool feasibility_pump_t::round(solution_t& solution) solution.assignment.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); + + const int64_t seed_after = cuopt::seed_generator::peek_seed(); + CUOPT_DETERMINISM_LOG("FP round exit: hash=0x%x seed=%lld seed_delta=%lld feasible=%d rem=%.6f", + solution.get_hash(), + (long long)seed_after, + (long long)(seed_after - seed_before), + (int)result, + timer.remaining_time()); + if (result) { CUOPT_LOG_DEBUG("New feasible solution with objective %g", solution.get_user_objective()); } @@ -308,6 +393,13 @@ bool feasibility_pump_t::test_fj_feasible(solution_t& soluti fj.settings.feasibility_run = true; fj.settings.n_of_minimums_for_exit = 5000; fj.settings.time_limit = std::min(time_limit, timer.remaining_time()); + if (timer.deterministic) { + fj.settings.time_limit = std::max((f_t)0.0, fj.settings.time_limit); + if (fj.settings.time_limit == 0.0) { + CUOPT_LOG_DEBUG("Skipping 20%% FJ run due to exhausted deterministic work budget"); + return false; + } + } cuopt_func_call(solution.test_variable_bounds(true)); is_feasible = fj.solve(solution); cuopt_func_call(solution.test_variable_bounds(true)); @@ -472,14 +564,39 @@ template bool feasibility_pump_t::run_single_fp_descent(solution_t& solution) { raft::common::nvtx::range fun_scope("run_single_fp_descent"); + i_t fp_iter = 0; + CUOPT_DETERMINISM_LOG("FP descent start: hash=0x%x feas=%d obj=%.12f timer_det=%d rem=%.6f", + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective(), + (int)timer.deterministic, + timer.remaining_time()); // start by doing nearest rounding solution.round_nearest(); + CUOPT_DETERMINISM_LOG("FP descent after initial round: hash=0x%x feas=%d obj=%.12f", + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective()); + cuopt_assert(last_projection.size() == solution.assignment.size(), "Size mismatch"); + // First projection in a descent has no previous projection history: initialize explicitly + raft::copy(last_projection.data(), + solution.assignment.data(), + solution.assignment.size(), + solution.handle_ptr->get_stream()); raft::copy(last_rounding.data(), solution.assignment.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); while (true) { - if (context.diversity_manager_ptr->check_b_b_preemption() || timer.check_time_limit()) { + CUOPT_DETERMINISM_LOG("FP iter %d pre-projection: hash=0x%x feas=%d obj=%.12f rem=%.6f", + fp_iter, + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective(), + timer.remaining_time()); + bool preempt = context.diversity_manager_ptr != nullptr && + context.diversity_manager_ptr->check_b_b_preemption(); + if (preempt || timer.check_time_limit()) { CUOPT_LOG_DEBUG("FP time limit reached!"); round(solution); return false; @@ -489,10 +606,25 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s f_t ratio_of_assigned_integers = f_t(solution.n_assigned_integers) / solution.problem_ptr->n_integer_vars; bool is_feasible = linear_project_onto_polytope(solution, ratio_of_assigned_integers); - i_t n_integers = solution.compute_number_of_integers(); + const f_t remaining_after_projection = timer.remaining_time(); + i_t n_integers = solution.compute_number_of_integers(); CUOPT_LOG_DEBUG("after fp projection n_integers %d total n_integes %d", n_integers, solution.problem_ptr->n_integer_vars); + CUOPT_DETERMINISM_LOG( + "FP iter %d post-projection: hash=0x%x feasible_after_lp=%d obj=%.12f rem=%.6f lp_stage=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_projection, + proj_begin - remaining_after_projection); + CUOPT_DETERMINISM_LOG("FP iter %d pre-round: hash=0x%x feas=%d obj=%.12f rem=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_projection); bool is_cycle = true; // temp comment for presolve run if (config.check_distance_cycle) { @@ -524,30 +656,71 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s // run the LP with full precision to check if it actually is feasible const f_t lp_verify_time_limit = 5.; relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_verify_time_limit; + lp_settings.time_limit = lp_verify_time_limit; + bool run_verify_lp = true; + if (timer.deterministic) { + const f_t remaining_work_limit = std::max((f_t)0.0, timer.remaining_time()); + lp_settings.work_limit = std::min(lp_verify_time_limit, remaining_work_limit); + lp_settings.time_limit = lp_settings.work_limit; + if (lp_settings.work_limit == 0.0) { + CUOPT_LOG_DEBUG( + "Skipping FP verification LP due to exhausted deterministic work budget"); + run_verify_lp = false; + } + } + lp_settings.work_context = timer.work_context; lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; lp_settings.return_first_feasible = true; lp_settings.save_state = true; - run_lp_with_vars_fixed(*solution.problem_ptr, - solution, - solution.problem_ptr->integer_indices, - lp_settings, - &constraint_prop.bounds_update); - is_feasible = solution.get_feasible(); - n_integers = solution.compute_number_of_integers(); - if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) { - CUOPT_LOG_DEBUG("Feasible solution verified with LP!"); - return true; + if (run_verify_lp) { + run_lp_with_vars_fixed(*solution.problem_ptr, + solution, + solution.problem_ptr->integer_indices, + lp_settings, + &constraint_prop.bounds_update); + is_feasible = solution.get_feasible(); + n_integers = solution.compute_number_of_integers(); + if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) { + CUOPT_LOG_TRACE("Feasible solution verified with LP!"); + return true; + } } } } cuopt_func_call(solution.test_variable_bounds(false)); is_feasible = round(solution); cuopt_func_call(solution.test_variable_bounds(true)); - proj_and_round_time = proj_begin - timer.remaining_time(); + const f_t remaining_after_round = timer.remaining_time(); + proj_and_round_time = proj_begin - remaining_after_round; + CUOPT_DETERMINISM_LOG( + "FP iter %d post-round: hash=0x%x feasible_after_round=%d obj=%.12f rem=%.6f " + "round_stage=%.6f proj_round_total=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_round, + remaining_after_projection - remaining_after_round, + proj_and_round_time); if (!is_feasible) { const f_t time_ratio = 0.2; - is_feasible = test_fj_feasible(solution, time_ratio * proj_and_round_time); + const f_t fj_budget = time_ratio * proj_and_round_time; + CUOPT_DETERMINISM_LOG("FP iter %d pre-fj-fallback: hash=0x%x rem=%.6f fj_budget=%.6f", + fp_iter, + solution.get_hash(), + remaining_after_round, + fj_budget); + is_feasible = test_fj_feasible(solution, fj_budget); + const f_t remaining_after_fj = timer.remaining_time(); + CUOPT_DETERMINISM_LOG( + "FP iter %d post-fj-fallback: hash=0x%x feasible_after_fj=%d obj=%.12f rem=%.6f " + "fj_stage=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_fj, + remaining_after_round - remaining_after_fj); } if (timer.check_time_limit()) { CUOPT_LOG_DEBUG("FP time limit reached!"); @@ -576,6 +749,7 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s return false; } cycle_queue.n_iterations_without_cycle++; + fp_iter++; } // unreachable return false; diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh index df3ad405e6..43c8a0592e 100644 --- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh +++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -106,7 +107,6 @@ class feasibility_pump_t { feasibility_pump_t() = delete; feasibility_pump_t(mip_solver_context_t& context, fj_t& fj, - // fj_tree_t& fj_tree_, constraint_prop_t& constraint_prop_, line_segment_search_t& line_segment_search_, rmm::device_uvector& lp_optimal_solution_); @@ -128,7 +128,7 @@ class feasibility_pump_t { bool check_distance_cycle(solution_t& solution); void reset(); void resize_vectors(problem_t& problem, const raft::handle_t* handle_ptr); - bool random_round_with_fj(solution_t& solution, timer_t& round_timer); + bool random_round_with_fj(solution_t& solution, termination_checker_t& round_timer); bool round_multiple_points(solution_t& solution); void relax_general_integers(solution_t& solution); void revert_relaxation(solution_t& solution); @@ -137,7 +137,6 @@ class feasibility_pump_t { mip_solver_context_t& context; // keep a reference from upstream local search fj_t& fj; - // fj_tree_t& fj_tree; line_segment_search_t& line_segment_search; cycle_queue_t cycle_queue; constraint_prop_t& constraint_prop; @@ -156,7 +155,7 @@ class feasibility_pump_t { f_t proj_begin; i_t n_fj_single_descents; i_t max_n_of_integers = 0; - cuopt::timer_t timer; + cuopt::termination_checker_t timer; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu index ce70aec745..381a8a04e8 100644 --- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu +++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu @@ -17,8 +17,10 @@ namespace cuopt::linear_programming::detail { template line_segment_search_t::line_segment_search_t( - fj_t& fj_, constraint_prop_t& constraint_prop_) - : fj(fj_), constraint_prop(constraint_prop_) + mip_solver_context_t& context_, + fj_t& fj_, + constraint_prop_t& constraint_prop_) + : context(context_), fj(fj_), constraint_prop(constraint_prop_) { } @@ -128,7 +130,7 @@ bool line_segment_search_t::search_line_segment( const rmm::device_uvector& point_2, const rmm::device_uvector& delta_vector, bool is_feasibility_run, - cuopt::timer_t& timer) + cuopt::termination_checker_t& timer) { CUOPT_LOG_DEBUG("Running line segment search with a given delta vector"); cuopt_assert(point_1.size() == point_2.size(), "size mismatch"); @@ -263,7 +265,7 @@ bool line_segment_search_t::search_line_segment(solution_t& const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, bool is_feasibility_run, - cuopt::timer_t& timer) + cuopt::termination_checker_t& timer) { CUOPT_LOG_DEBUG("Running line segment search"); cuopt_assert(point_1.size() == point_2.size(), "size mismatch"); diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh index 30e169e9d9..d5e5596c70 100644 --- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh +++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh @@ -9,7 +9,7 @@ #include #include -#include +#include namespace cuopt::linear_programming::detail { @@ -26,19 +26,21 @@ template class line_segment_search_t { public: line_segment_search_t() = delete; - line_segment_search_t(fj_t& fj, constraint_prop_t& constraint_prop); + line_segment_search_t(mip_solver_context_t& context, + fj_t& fj, + constraint_prop_t& constraint_prop); bool search_line_segment(solution_t& solution, const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, bool is_feasibility_run, - cuopt::timer_t& timer); + cuopt::termination_checker_t& timer); bool search_line_segment(solution_t& solution, const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, const rmm::device_uvector& delta_vector, bool is_feasibility_run, - cuopt::timer_t& timer); + cuopt::termination_checker_t& timer); void save_solution_if_better(solution_t& solution, const rmm::device_uvector& point_1, @@ -49,6 +51,7 @@ class line_segment_search_t { f_t& best_feasible_cost, f_t curr_cost); + mip_solver_context_t& context; fj_t& fj; constraint_prop_t& constraint_prop; line_segment_settings_t settings; diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index da29511d70..fa92ec624c 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -15,8 +15,9 @@ #include #include #include +#include #include -#include +#include #include @@ -24,6 +25,15 @@ #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { template @@ -36,7 +46,7 @@ local_search_t::local_search_t(mip_solver_context_t& context fj(context), // fj_tree(fj), constraint_prop(context), - line_segment_search(fj, constraint_prop), + line_segment_search(context, fj, constraint_prop), fp(context, fj, // fj_tree, @@ -54,18 +64,17 @@ local_search_t::local_search_t(mip_solver_context_t& context scratch_cpu_fj.push_back(std::make_unique>()); scratch_cpu_fj.back()->fj_ptr = &fj; scratch_cpu_fj_on_lp_opt.fj_ptr = &fj; + CUOPT_DETERMINISM_LOG("Deterministic solve start local_search state: seed_state=%lld", + (long long)cuopt::seed_generator::peek_seed()); fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit; } -static double local_search_best_obj = std::numeric_limits::max(); -static population_t* pop_ptr = nullptr; - template void local_search_t::start_cpufj_scratch_threads(population_t& population) { - pop_ptr = &population; - + cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS), + "Scratch CPUFJ must remain opportunistic-only"); std::vector default_weights(context.problem_ptr->n_constraints, 1.); solution_t solution(*context.problem_ptr); @@ -88,18 +97,9 @@ void local_search_t::start_cpufj_scratch_threads(population_tlog_prefix = "******* scratch " + std::to_string(counter) + ": "; cpu_fj.fj_cpu->improvement_callback = - [&population, problem_ptr = context.problem_ptr]( - f_t obj, const std::vector& h_vec, double /*work_units*/) { - population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - (void)problem_ptr; - if (obj < local_search_best_obj) { - CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g", - problem_ptr->get_user_obj_from_solver_obj(obj), - problem_ptr->get_user_obj_from_solver_obj( - population.is_feasible() ? population.best_feasible().get_objective() - : std::numeric_limits::max())); - local_search_best_obj = obj; - } + [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + population.add_external_solution( + h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; counter++; }; @@ -113,7 +113,8 @@ template void local_search_t::start_cpufj_lptopt_scratch_threads( population_t& population) { - pop_ptr = &population; + cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS), + "LP-opt CPUFJ scratch must remain opportunistic-only"); std::vector default_weights(context.problem_ptr->n_constraints, 1.); @@ -125,16 +126,9 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = - [this, &population](f_t obj, const std::vector& h_vec, double /*work_units*/) { - population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - if (obj < local_search_best_obj) { - CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", - context.problem_ptr->get_user_obj_from_solver_obj(obj), - context.problem_ptr->get_user_obj_from_solver_obj( - population.is_feasible() ? population.best_feasible().get_objective() - : std::numeric_limits::max())); - local_search_best_obj = obj; - } + [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + population.add_external_solution( + h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; // default weights @@ -182,8 +176,11 @@ void local_search_t::start_cpufj_deterministic( // Set up callback to send solutions to B&B with work unit timestamps deterministic_cpu_fj.fj_cpu->improvement_callback = - [&bb](f_t obj, const std::vector& h_vec, double work_units) { - bb.queue_external_solution_deterministic(h_vec, work_units); + [&bb, problem_ptr = context.problem_ptr]( + f_t obj, const std::vector& h_vec, double work_units) { + f_t user_obj = problem_ptr->get_user_obj_from_solver_obj(obj); + bb.queue_external_solution_deterministic( + h_vec, user_obj, work_units, cuopt::internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; deterministic_cpu_fj.start_cpu_solver(); @@ -211,8 +208,9 @@ bool local_search_t::do_fj_solve(solution_t& solution, const std::string& source) { if (time_limit == 0.) return solution.get_feasible(); + const bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); - timer_t timer(time_limit); + termination_checker_t timer(context.gpu_heur_loop, time_limit, *context.termination); const auto old_n_cstr_weights = in_fj.cstr_weights.size(); const auto expected_n_cstr_weights = static_cast(solution.problem_ptr->n_constraints); // in case this is the first time run, resize @@ -231,17 +229,24 @@ bool local_search_t::do_fj_solve(solution_t& solution, 1.); } } - auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); - auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); - for (auto& cpu_fj_ptr : ls_cpu_fj) { - auto& cpu_fj = *cpu_fj_ptr; - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - h_weights, - h_weights, - h_objective_weight, - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); + + { + auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); + auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); + for (auto& cpu_fj_ptr : ls_cpu_fj) { + auto& cpu_fj = *cpu_fj_ptr; + cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, + h_weights, + h_weights, + h_objective_weight, + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); + if (deterministic) { + cpu_fj.fj_cpu->work_units_elapsed = 0.0; + cpu_fj.fj_cpu->work_budget = time_limit; + } + } } auto solution_copy = solution; @@ -256,9 +261,10 @@ bool local_search_t::do_fj_solve(solution_t& solution, in_fj.settings.time_limit = timer.remaining_time(); in_fj.solve(solution); - // Stop CPU solver - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->stop_cpu_solver(); + if (!deterministic) { + for (auto& cpu_fj_ptr : ls_cpu_fj) { + cpu_fj_ptr->stop_cpu_solver(); + } } auto gpu_fj_end = std::chrono::high_resolution_clock::now(); @@ -267,7 +273,6 @@ bool local_search_t::do_fj_solve(solution_t& solution, solution_t solution_cpu(*solution.problem_ptr); f_t best_cpu_obj = std::numeric_limits::max(); - // // Wait for CPU solver to finish for (auto& cpu_fj_ptr : ls_cpu_fj) { bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver(); if (cpu_sol_found) { @@ -313,8 +318,10 @@ bool local_search_t::do_fj_solve(solution_t& solution, } template -void local_search_t::generate_fast_solution(solution_t& solution, timer_t timer) +void local_search_t::generate_fast_solution(solution_t& solution, + termination_checker_t& timer) { + CUOPT_LOG_DEBUG("Running FJ fast sol"); thrust::fill(solution.handle_ptr->get_thrust_policy(), solution.assignment.begin(), solution.assignment.end(), @@ -325,8 +332,11 @@ void local_search_t::generate_fast_solution(solution_t& solu fj.settings.update_weights = true; fj.settings.feasibility_run = true; fj.settings.time_limit = std::min(30., timer.remaining_time()); - while (!context.diversity_manager_ptr->check_b_b_preemption() && !timer.check_time_limit()) { - timer_t constr_prop_timer = timer_t(std::min(timer.remaining_time(), 2.)); + while ((context.diversity_manager_ptr == nullptr || + !context.diversity_manager_ptr->check_b_b_preemption()) && + !timer.check_time_limit()) { + termination_checker_t constr_prop_timer = termination_checker_t( + context.gpu_heur_loop, std::min(timer.remaining_time(), 2.), *context.termination); // do constraint prop on lp optimal solution constraint_prop.apply_round(solution, 1., constr_prop_timer); if (solution.compute_feasibility()) { return; } @@ -343,7 +353,7 @@ void local_search_t::generate_fast_solution(solution_t& solu template bool local_search_t::run_local_search(solution_t& solution, const weight_t& weights, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("local search"); @@ -353,14 +363,14 @@ bool local_search_t::run_local_search(solution_t& solution, if (!solution.get_feasible()) { if (ls_config.at_least_one_parent_feasible) { fj_settings.time_limit = 0.5; - timer = timer_t(fj_settings.time_limit); } else { fj_settings.time_limit = 0.25; - timer = timer_t(fj_settings.time_limit); } } else { fj_settings.time_limit = std::min(1., timer.remaining_time()); } + // Limit this nested local-search pass without discarding the caller's remaining budget. + termination_checker_t local_timer(context.gpu_heur_loop, fj_settings.time_limit, timer); fj_settings.update_weights = false; fj_settings.feasibility_run = false; fj.set_fj_settings(fj_settings); @@ -375,11 +385,11 @@ bool local_search_t::run_local_search(solution_t& solution, } if (rd == ls_method_t::FJ_LINE_SEGMENT && lp_optimal_exists) { fj.copy_weights(weights, solution.handle_ptr); - is_feas = run_fj_line_segment(solution, timer, ls_config); + is_feas = run_fj_line_segment(solution, local_timer, ls_config); } else { fj.copy_weights(weights, solution.handle_ptr); - is_feas = run_fj_annealing(solution, timer, ls_config); - if (lp_optimal_exists) { is_feas = run_fj_line_segment(solution, timer, ls_config); } + is_feas = run_fj_annealing(solution, local_timer, ls_config); + if (lp_optimal_exists) { is_feas = run_fj_line_segment(solution, local_timer, ls_config); } } return is_feas; } @@ -387,8 +397,9 @@ bool local_search_t::run_local_search(solution_t& solution, template bool local_search_t::run_fj_until_timer(solution_t& solution, const weight_t& weights, - timer_t timer) + termination_checker_t& timer) { + CUOPT_LOG_DEBUG("Running FJ until timer"); bool is_feasible; fj.settings.n_of_minimums_for_exit = 1e6; fj.settings.mode = fj_mode_t::EXIT_NON_IMPROVING; @@ -405,7 +416,7 @@ bool local_search_t::run_fj_until_timer(solution_t& solution template bool local_search_t::run_fj_annealing(solution_t& solution, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_fj_annealing"); @@ -435,7 +446,7 @@ bool local_search_t::run_fj_annealing(solution_t& solution, template bool local_search_t::run_fj_line_segment(solution_t& solution, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_fj_line_segment"); @@ -458,7 +469,7 @@ bool local_search_t::run_fj_line_segment(solution_t& solutio template bool local_search_t::check_fj_on_lp_optimal(solution_t& solution, bool perturb, - timer_t timer) + termination_checker_t& timer) { raft::common::nvtx::range fun_scope("check_fj_on_lp_optimal"); if (lp_optimal_exists) { @@ -474,15 +485,21 @@ bool local_search_t::check_fj_on_lp_optimal(solution_t& solu solution.assign_random_within_bounds(perturbation_ratio); } cuopt_func_call(solution.test_variable_bounds(false)); - f_t lp_run_time_after_feasible = std::min(1., timer.remaining_time()); - timer_t bounds_prop_timer = timer_t(std::min(timer.remaining_time(), 10.)); + f_t lp_run_time_after_feasible = std::min(1., timer.remaining_time()); + termination_checker_t bounds_prop_timer = termination_checker_t( + context.gpu_heur_loop, std::min(timer.remaining_time(), 10.), *context.termination); bool is_feasible = constraint_prop.apply_round(solution, lp_run_time_after_feasible, bounds_prop_timer); if (!is_feasible) { const f_t lp_run_time = 2.; relaxed_lp_settings_t lp_settings; lp_settings.time_limit = std::min(lp_run_time, timer.remaining_time()); - lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } + lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; run_lp_with_vars_fixed( *solution.problem_ptr, solution, solution.problem_ptr->integer_indices, lp_settings); } else { @@ -499,7 +516,8 @@ bool local_search_t::check_fj_on_lp_optimal(solution_t& solu } template -bool local_search_t::run_fj_on_zero(solution_t& solution, timer_t timer) +bool local_search_t::run_fj_on_zero(solution_t& solution, + termination_checker_t& timer) { raft::common::nvtx::range fun_scope("run_fj_on_zero"); thrust::fill(solution.handle_ptr->get_thrust_policy(), @@ -518,7 +536,7 @@ bool local_search_t::run_fj_on_zero(solution_t& solution, ti template bool local_search_t::run_staged_fp(solution_t& solution, - timer_t timer, + termination_checker_t& timer, population_t* population_ptr) { raft::common::nvtx::range fun_scope("run_staged_fp"); @@ -546,7 +564,8 @@ bool local_search_t::run_staged_fp(solution_t& solution, } CUOPT_LOG_DEBUG("Running staged FP from beginning it %d", i); fp.relax_general_integers(solution); - timer_t binary_timer(timer.remaining_time() / 3); + termination_checker_t binary_timer( + context.gpu_heur_loop, timer.remaining_time() / 3, *context.termination); i_t binary_it_counter = 0; for (; binary_it_counter < 100; ++binary_it_counter) { population_ptr->add_external_solutions_to_population(); @@ -658,7 +677,8 @@ void local_search_t::reset_alpha_and_save_solution( solution_t solution_copy(solution); solution_copy.problem_ptr = old_problem_ptr; solution_copy.resize_to_problem(); - population_ptr->add_solution(std::move(solution_copy)); + population_ptr->add_solution(std::move(solution_copy), + internals::mip_solution_origin_t::LOCAL_SEARCH); population_ptr->add_external_solutions_to_population(); if (!cutting_plane_added_for_active_run) { solution.problem_ptr = &problem_with_objective_cut; @@ -712,34 +732,54 @@ void local_search_t::reset_alpha_and_run_recombiners( template bool local_search_t::run_fp(solution_t& solution, - timer_t timer, - population_t* population_ptr) + termination_checker_t& timer, + population_t* population_ptr, + i_t n_fp_iterations) { raft::common::nvtx::range fun_scope("run_fp"); cuopt_assert(population_ptr != nullptr, "Population pointer must not be null"); - const i_t n_fp_iterations = 1000000; bool is_feasible = solution.compute_feasibility(); cutting_plane_added_for_active_run = is_feasible; double best_objective = is_feasible ? solution.get_objective() : std::numeric_limits::max(); rmm::device_uvector best_solution(solution.assignment, solution.handle_ptr->get_stream()); problem_t* old_problem_ptr = solution.problem_ptr; - fp.timer = timer_t(timer.remaining_time()); + fp.timer = + termination_checker_t(context.gpu_heur_loop, timer.remaining_time(), *context.termination); // if it has not been initialized yet, create a new problem and move it to the cut problem if (!problem_with_objective_cut.cutting_plane_added) { problem_with_objective_cut = std::move(problem_t(*old_problem_ptr)); + CUOPT_DETERMINISM_LOG("FP cut-problem clone: old_nv=%d old_nc=%d cut_nv=%d cut_nc=%d", + old_problem_ptr->n_variables, + old_problem_ptr->n_constraints, + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints); } if (is_feasible) { CUOPT_LOG_DEBUG("FP initial solution is feasible, adding cutting plane at obj"); f_t objective_cut = best_objective - std::max(std::abs(0.001 * best_objective), OBJECTIVE_EPSILON); + CUOPT_DETERMINISM_LOG( + "FP cut-problem add: cut_obj=%g cut_nv=%d cut_nc=%d cut_added=%d fj_w=%zu", + objective_cut, + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints, + (int)problem_with_objective_cut.cutting_plane_added, + fj.cstr_weights.size()); problem_with_objective_cut.add_cutting_plane_at_objective(objective_cut); + CUOPT_DETERMINISM_LOG("FP cut-problem post-add: cut_nv=%d cut_nc=%d", + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints); // Do the copy here for proper handling of the added constraints weight fj.copy_weights( population_ptr->weights, solution.handle_ptr, problem_with_objective_cut.n_constraints); solution.problem_ptr = &problem_with_objective_cut; solution.resize_to_problem(); resize_to_new_problem(); + CUOPT_DETERMINISM_LOG("FP cut-problem resize done: sol_assign=%zu sol_nv=%d sol_nc=%d", + solution.assignment.size(), + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints); } i_t last_improved_iteration = 0; for (i_t i = 0; i < n_fp_iterations && !timer.check_time_limit(); ++i) { @@ -806,14 +846,44 @@ bool local_search_t::run_fp(solution_t& solution, } } } + CUOPT_DETERMINISM_LOG( + "FP teardown start: assign=%zu best=%zu curr_pb=%p old_pb=%p curr_nv=%d curr_nc=%d " + "old_nv=%d old_nc=%d prevp=%zu prevd=%zu fp_rem=%g parent_rem=%g gpu_work=%g " + "gpu_prod=%g cut_added=%d", + solution.assignment.size(), + best_solution.size(), + (void*)solution.problem_ptr, + (void*)old_problem_ptr, + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints, + old_problem_ptr->n_variables, + old_problem_ptr->n_constraints, + solution.lp_state.prev_primal.size(), + solution.lp_state.prev_dual.size(), + fp.timer.remaining_time(), + timer.remaining_time(), + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work(), + (int)problem_with_objective_cut.cutting_plane_added); raft::copy(solution.assignment.data(), best_solution.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG("FP teardown post-copy: assign=%zu", solution.assignment.size()); solution.problem_ptr = old_problem_ptr; + CUOPT_DETERMINISM_LOG("FP teardown post-ptr: pb=%p nv=%d nc=%d", + (void*)solution.problem_ptr, + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints); solution.resize_to_problem(); + CUOPT_DETERMINISM_LOG("FP teardown post-resize: assign=%zu prevp=%zu prevd=%zu", + solution.assignment.size(), + solution.lp_state.prev_primal.size(), + solution.lp_state.prev_dual.size()); resize_to_old_problem(old_problem_ptr); solution.handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG( + "FP teardown post-sync: hash=0x%x feas=%d", solution.get_hash(), (int)solution.get_feasible()); return is_feasible; } @@ -825,7 +895,7 @@ bool local_search_t::generate_solution(solution_t& solution, { raft::common::nvtx::range fun_scope("generate_solution"); cuopt_assert(population_ptr != nullptr, "Population pointer must not be null"); - timer_t timer(time_limit); + termination_checker_t timer(context.gpu_heur_loop, time_limit, *context.termination); auto n_vars = solution.problem_ptr->n_variables; auto n_binary_vars = solution.problem_ptr->get_n_binary_variables(); auto n_integer_vars = solution.problem_ptr->n_integer_vars; diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh index 94493ebcb3..3323b5f621 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cuh +++ b/cpp/src/mip_heuristics/local_search/local_search.cuh @@ -13,13 +13,7 @@ #include #include #include -#include - -#include -#include -#include -#include -#include +#include namespace cuopt::linear_programming::dual_simplex { template @@ -58,32 +52,35 @@ class local_search_t { void start_cpufj_scratch_threads(population_t& population); void start_cpufj_lptopt_scratch_threads(population_t& population); void stop_cpufj_scratch_threads(); - void generate_fast_solution(solution_t& solution, timer_t timer); + void generate_fast_solution(solution_t& solution, termination_checker_t& timer); bool generate_solution(solution_t& solution, bool perturb, population_t* population_ptr, f_t time_limit = 300.); bool run_fj_until_timer(solution_t& solution, const weight_t& weights, - timer_t timer); + termination_checker_t& timer); bool run_local_search(solution_t& solution, const weight_t& weights, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config); bool run_fj_annealing(solution_t& solution, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config); bool run_fj_line_segment(solution_t& solution, - timer_t timer, + termination_checker_t& timer, const ls_config_t& ls_config); - bool run_fj_on_zero(solution_t& solution, timer_t timer); - bool check_fj_on_lp_optimal(solution_t& solution, bool perturb, timer_t timer); + bool run_fj_on_zero(solution_t& solution, termination_checker_t& timer); + bool check_fj_on_lp_optimal(solution_t& solution, + bool perturb, + termination_checker_t& timer); bool run_staged_fp(solution_t& solution, - timer_t timer, + termination_checker_t& timer, population_t* population_ptr); bool run_fp(solution_t& solution, - timer_t timer, - population_t* population_ptr = nullptr); + termination_checker_t& timer, + population_t* population_ptr = nullptr, + i_t n_fp_iterations = std::numeric_limits::max()); void resize_vectors(problem_t& problem, const raft::handle_t* handle_ptr); bool do_fj_solve(solution_t& solution, diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu index f3233cc8f4..0bddbe27e9 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu @@ -8,16 +8,114 @@ #include "bounds_repair.cuh" #include +#include #include #include +#include #include #include #include #include #include +#include + +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { +namespace { + +constexpr double bounds_repair_setup_base_work = 5e-4; +constexpr double bounds_repair_violation_base_work = 4e-4; +constexpr double bounds_repair_violation_nnz_work = 2e-6; +constexpr double bounds_repair_violation_constraint_work = 3e-6; +constexpr double bounds_repair_best_bounds_variable_work = 2e-6; +constexpr double bounds_repair_shift_base_work = 3e-4; +constexpr double bounds_repair_shift_row_entry_work = 3e-6; +constexpr double bounds_repair_shift_candidate_work = 8e-6; +constexpr double bounds_repair_shift_neighbor_entry_work = 3e-6; +constexpr double bounds_repair_shift_sort_work = 5e-6; +constexpr double bounds_repair_damage_base_work = 3e-4; +constexpr double bounds_repair_damage_neighbor_entry_work = 8e-6; +constexpr double bounds_repair_damage_sort_work = 5e-6; +constexpr double bounds_repair_move_base_work = 5e-5; +constexpr double bounds_repair_no_candidate_base_work = 4e-4; +constexpr double bounds_repair_cycle_penalty_work = 3e-4; + +template +double estimate_bounds_repair_violation_refresh_work(const problem_t& problem, + bool update_best_bounds) +{ + double estimate = bounds_repair_violation_base_work + + bounds_repair_violation_nnz_work * (double)problem.nnz + + bounds_repair_violation_constraint_work * (double)problem.n_constraints; + if (update_best_bounds) { + estimate += bounds_repair_best_bounds_variable_work * (double)problem.n_variables; + } + return estimate; +} + +template +double estimate_bounds_repair_setup_work(const problem_t& problem) +{ + return bounds_repair_setup_base_work + + estimate_bounds_repair_violation_refresh_work(problem, true); +} + +template +double estimate_bounds_repair_shift_work(const problem_t& problem, + i_t curr_cstr, + i_t n_candidates, + bool is_cycle) +{ + const auto stream = problem.handle_ptr->get_stream(); + const i_t cstr_begin = problem.offsets.element(curr_cstr, stream); + const i_t cstr_end = problem.offsets.element(curr_cstr + 1, stream); + const double row_nnz = cstr_end - cstr_begin; + const double avg_rev_degree = + problem.n_variables > 0 ? ((double)problem.nnz / (double)problem.n_variables) : 0.0; + const double sort_work = + n_candidates > 1 ? (double)n_candidates * std::log2((double)n_candidates) : 0.0; + double estimate = bounds_repair_shift_base_work + bounds_repair_shift_row_entry_work * row_nnz; + if (n_candidates == 0) { estimate = bounds_repair_no_candidate_base_work + estimate; } + estimate += bounds_repair_shift_candidate_work * (double)n_candidates; + estimate += bounds_repair_shift_neighbor_entry_work * (double)n_candidates * avg_rev_degree; + estimate += bounds_repair_shift_sort_work * sort_work; + if (is_cycle) { estimate += bounds_repair_cycle_penalty_work; } + return estimate; +} + +template +double estimate_bounds_repair_damage_work(const problem_t& problem, i_t n_candidates) +{ + if (n_candidates == 0) { return 0.0; } + const double avg_rev_degree = + problem.n_variables > 0 ? ((double)problem.nnz / (double)problem.n_variables) : 0.0; + const double sort_work = + n_candidates > 1 ? (double)n_candidates * std::log2((double)n_candidates) : 0.0; + return bounds_repair_damage_base_work + + bounds_repair_damage_neighbor_entry_work * (double)n_candidates * avg_rev_degree + + bounds_repair_damage_sort_work * sort_work; +} + +template +void record_estimated_work(timer_t& timer, double* total_estimated_work, double work) +{ + cuopt_assert(std::isfinite(work) && work >= 0.0, "Bounds repair work estimate must be finite"); + timer.record_work(work); + *total_estimated_work += work; +} + +} // namespace + template bounds_repair_t::bounds_repair_t(const problem_t& pb, bound_presolve_t& bound_presolve_) @@ -30,7 +128,8 @@ bounds_repair_t::bounds_repair_t(const problem_t& pb, violated_cstr_map(0, pb.handle_ptr->get_stream()), total_vio(pb.handle_ptr->get_stream()), gen(cuopt::seed_generator::get_seed()), - cycle_vector(MAX_CYCLE_SEQUENCE, -1) + cycle_vector(MAX_CYCLE_SEQUENCE, -1), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -68,8 +167,7 @@ f_t bounds_repair_t::get_ii_violation(problem_t& problem) min_act = bound_presolve.upd.min_activity.data(), max_act = bound_presolve.upd.max_activity.data(), cstr_violations_up = cstr_violations_up.data(), - cstr_violations_down = cstr_violations_down.data(), - total_vio = total_vio.data()] __device__(i_t cstr_idx) { + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) { f_t cnst_lb = pb_v.constraint_lower_bounds[cstr_idx]; f_t cnst_ub = pb_v.constraint_upper_bounds[cstr_idx]; f_t eps = get_cstr_tolerance( @@ -79,21 +177,31 @@ f_t bounds_repair_t::get_ii_violation(problem_t& problem) f_t violation = max(curr_cstr_violation_up, curr_cstr_violation_down); if (violation >= ROUNDOFF_TOLERANCE) { violated_cstr_map[cstr_idx] = 1; - atomicAdd(total_vio, violation); } else { violated_cstr_map[cstr_idx] = 0; } cstr_violations_up[cstr_idx] = curr_cstr_violation_up; cstr_violations_down[cstr_idx] = curr_cstr_violation_down; }); - auto iter = thrust::copy_if(handle_ptr->get_thrust_policy(), + auto iter = thrust::copy_if(handle_ptr->get_thrust_policy(), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + problem.n_constraints, violated_cstr_map.data(), violated_constraints.data(), cuda::std::identity{}); - h_n_violated_cstr = iter - violated_constraints.data(); - f_t total_violation = total_vio.value(handle_ptr->get_stream()); + h_n_violated_cstr = iter - violated_constraints.data(); + // Use deterministic reduction instead of non-deterministic atomicAdd + f_t total_violation = thrust::transform_reduce( + handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + problem.n_constraints, + [cstr_violations_up = cstr_violations_up.data(), + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t { + auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]); + return violation >= ROUNDOFF_TOLERANCE ? violation : 0.; + }, + (f_t)0, + thrust::plus()); CUOPT_LOG_TRACE( "Repair: n_violated_cstr %d total_violation %f", h_n_violated_cstr, total_violation); return total_violation; @@ -103,10 +211,13 @@ template i_t bounds_repair_t::get_random_cstr() { std::uniform_int_distribution<> dist(0, h_n_violated_cstr - 1); - // Generate random number - i_t random_number = dist(gen); - i_t cstr_idx = violated_constraints.element(random_number, handle_ptr->get_stream()); + i_t random_index = dist(gen); + i_t cstr_idx = violated_constraints.element(random_index, handle_ptr->get_stream()); CUOPT_LOG_TRACE("Repair: selected random cstr %d", cstr_idx); + CUOPT_DETERMINISM_LOG("Repair cstr select: random_index=%d cstr=%d n_violated=%d", + random_index, + cstr_idx, + h_n_violated_cstr); return cstr_idx; } @@ -190,7 +301,14 @@ i_t bounds_repair_t::compute_best_shift(problem_t& problem, } }); handle_ptr->sync_stream(); - return candidates.n_candidates.value(handle_ptr->get_stream()); + i_t n_candidates = candidates.n_candidates.value(handle_ptr->get_stream()); + + // Sort by (variable_index, bound_shift) to ensure fully deterministic ordering + auto key_iter = thrust::make_zip_iterator( + thrust::make_tuple(candidates.variable_index.begin(), candidates.bound_shift.begin())); + thrust::sort(handle_ptr->get_thrust_policy(), key_iter, key_iter + n_candidates); + + return n_candidates; } template @@ -377,36 +495,100 @@ void bounds_repair_t::apply_move(problem_t& problem, template bool bounds_repair_t::repair_problem(problem_t& problem, problem_t& original_problem, - timer_t timer_, + termination_checker_t& timer_, const raft::handle_t* handle_ptr_) { CUOPT_LOG_DEBUG("Running bounds repair"); handle_ptr = handle_ptr_; timer = timer_; + cuopt_assert(timer.deterministic == problem.deterministic, + "Bounds repair timer/problem determinism mismatch"); resize(problem); reset(); best_violation = get_ii_violation(problem); curr_violation = best_violation; best_bounds.update_from(problem, handle_ptr); - i_t no_candidate_in_a_row = 0; - while (h_n_violated_cstr > 0) { + double total_estimated_work = 0.0; + i_t repair_iterations = 0; + if (timer.deterministic) { + const double setup_work = estimate_bounds_repair_setup_work(problem); + record_estimated_work(timer, &total_estimated_work, setup_work); + CUOPT_DETERMINISM_LOG( + "Repair entry: pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x n_violated=%d " + "best_violation=%.6f timer_rem=%.6f total_work=%.6f setup_work=%.6f", + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + best_violation, + timer.remaining_time(), + total_estimated_work, + setup_work); + } + i_t no_candidate_in_a_row = 0; + [[maybe_unused]] const char* exit_reason = "FEASIBLE"; + // TODO: do this better + i_t iter_limit = std::numeric_limits::max(); + if (timer.deterministic) { iter_limit = 20; } + while (h_n_violated_cstr > 0 && iter_limit-- > 0) { + repair_iterations++; CUOPT_LOG_TRACE("Bounds repair loop: n_violated %d best_violation %f curr_violation %f", h_n_violated_cstr, best_violation, curr_violation); - if (timer.check_time_limit()) { break; } + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair iter entry: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x " + "n_violated=%d best_violation=%.6f curr_violation=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + best_violation, + curr_violation, + timer.remaining_time(), + total_estimated_work); + } + if (timer.check_time_limit()) { + exit_reason = "TIME_LIMIT"; + break; + } i_t curr_cstr = get_random_cstr(); // best way would be to check a variable cycle, but this is easier and more performant bool is_cycle = detect_cycle(curr_cstr); if (is_cycle) { CUOPT_LOG_DEBUG("Repair: cycle detected at cstr %d", curr_cstr); } // in parallel compute the best shift and best respective damage - i_t n_candidates = compute_best_shift(problem, original_problem, curr_cstr); + i_t n_candidates = compute_best_shift(problem, original_problem, curr_cstr); + double shift_work = 0.0; + if (timer.deterministic) { + shift_work = estimate_bounds_repair_shift_work(problem, curr_cstr, n_candidates, is_cycle); + record_estimated_work(timer, &total_estimated_work, shift_work); + CUOPT_DETERMINISM_LOG( + "Repair iter shift: iter=%d curr_cstr=%d cycle=%d n_candidates=%d cand_var_hash=0x%x " + "cand_shift_hash=0x%x singleton_moved=%d shift_work=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + curr_cstr, + (int)is_cycle, + n_candidates, + detail::compute_hash(make_span(candidates.variable_index, 0, n_candidates), + handle_ptr->get_stream()), + detail::compute_hash(make_span(candidates.bound_shift, 0, n_candidates), + handle_ptr->get_stream()), + (int)candidates.at_least_one_singleton_moved.value(handle_ptr->get_stream()), + shift_work, + timer.remaining_time(), + total_estimated_work); + } // if no candidate is there continue with another constraint if (n_candidates == 0) { CUOPT_LOG_DEBUG("Repair: no candidate var found for cstr %d", curr_cstr); if (no_candidate_in_a_row++ == 10 || h_n_violated_cstr == 1) { CUOPT_LOG_DEBUG("Repair: no candidate var found on last violated constraint %d. Exiting...", curr_cstr); + exit_reason = "NO_CANDIDATE"; break; } continue; @@ -418,17 +600,36 @@ bool bounds_repair_t::repair_problem(problem_t& problem, // get the best damage i_t best_cstr_delta = candidates.cstr_delta.front_element(handle_ptr->get_stream()); f_t best_damage = candidates.damage.front_element(handle_ptr->get_stream()); + double damage_work = 0.0; + if (timer.deterministic) { + damage_work = estimate_bounds_repair_damage_work(problem, n_candidates); + record_estimated_work(timer, &total_estimated_work, damage_work); + CUOPT_DETERMINISM_LOG( + "Repair iter damage: iter=%d curr_cstr=%d cand_cdelta_hash=0x%x cand_damage_hash=0x%x " + "best_cstr_delta=%d best_damage=%.6f damage_work=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + curr_cstr, + detail::compute_hash(make_span(candidates.cstr_delta, 0, n_candidates), + handle_ptr->get_stream()), + detail::compute_hash(make_span(candidates.damage, 0, n_candidates), + handle_ptr->get_stream()), + best_cstr_delta, + best_damage, + damage_work, + timer.remaining_time(), + total_estimated_work); + } CUOPT_LOG_TRACE( "Repair: best_cstr_delta value %d best_damage %f", best_cstr_delta, best_damage); i_t best_move_idx; - // if the best damage is positive and we are within the prop (paper uses 0.75) - if ((best_cstr_delta > 0 && rand_double(0, 1, gen) < p) || is_cycle) { - // pick a random move from the candidate list + i_t n_of_eligible_candidates = -1; + + const double rand_u01 = rand_double(0, 1, gen); + const bool took_random_branch = (best_cstr_delta > 0 && rand_u01 < p) || is_cycle; + if (took_random_branch) { best_move_idx = get_random_idx(n_candidates); } else { - // filter the moves with best_damage(it can be zero or not) and then pick a candidate among - // them - i_t n_of_eligible_candidates = + n_of_eligible_candidates = find_cutoff_index(candidates, best_cstr_delta, best_damage, n_candidates); cuopt_assert(n_of_eligible_candidates > 0, ""); CUOPT_LOG_TRACE("n_of_eligible_candidates %d", n_of_eligible_candidates); @@ -440,22 +641,79 @@ bool bounds_repair_t::repair_problem(problem_t& problem, candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()), candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()), candidates.damage.element(best_move_idx, handle_ptr->get_stream())); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair iter select: iter=%d cycle=%d rand_u01=%.12f took_random=%d " + "cutoff_idx=%d n_eligible=%d chosen_idx=%d chosen_var=%d chosen_shift=%.6f " + "chosen_cdelta=%d chosen_damage=%.6f", + repair_iterations, + (int)is_cycle, + rand_u01, + (int)took_random_branch, + (int)(took_random_branch ? -1 : n_of_eligible_candidates), + (int)(took_random_branch ? n_candidates : n_of_eligible_candidates), + best_move_idx, + candidates.variable_index.element(best_move_idx, handle_ptr->get_stream()), + candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()), + candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()), + candidates.damage.element(best_move_idx, handle_ptr->get_stream())); + } apply_move(problem, original_problem, best_move_idx); reset(); // TODO we might optimize this to only calculate the changed constraints - curr_violation = get_ii_violation(problem); + curr_violation = get_ii_violation(problem); + const bool improved_violation = curr_violation < best_violation; + double refresh_work = 0.0; + if (timer.deterministic) { + refresh_work = bounds_repair_move_base_work + + estimate_bounds_repair_violation_refresh_work(problem, improved_violation); + record_estimated_work(timer, &total_estimated_work, refresh_work); + CUOPT_DETERMINISM_LOG( + "Repair iter post: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x " + "n_violated=%d curr_violation=%.6f improved=%d refresh_work=%.6f total_work=%.6f " + "timer_rem=%.6f", + repair_iterations, + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + curr_violation, + (int)improved_violation, + refresh_work, + total_estimated_work, + timer.remaining_time()); + CUOPT_DETERMINISM_LOG( + "Repair iter work: cstr=%d candidates=%d cycle=%d improved=%d total=%.6f", + curr_cstr, + n_candidates, + (int)is_cycle, + (int)improved_violation, + total_estimated_work); + } - if (curr_violation < best_violation) { + if (improved_violation) { best_violation = curr_violation; // update best bounds best_bounds.update_from(problem, handle_ptr); } } - // fill the problem with the best bounds + if (h_n_violated_cstr > 0 && iter_limit <= 0) { exit_reason = "ITER_LIMIT"; } bool feasible = h_n_violated_cstr == 0; - // copy best bounds into problem best_bounds.update_to(problem, handle_ptr); CUOPT_LOG_DEBUG("Repair: returning with feas: %d vio %f", feasible, best_violation); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair exit: reason=%s iters=%d feasible=%d n_violated=%d best_violation=%.6f " + "total_work=%.6f timer_rem=%.6f", + exit_reason, + repair_iterations, + (int)feasible, + h_n_violated_cstr, + best_violation, + total_estimated_work, + timer.remaining_time()); + } return feasible; } diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh index 29161c5d25..5991366767 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh @@ -13,6 +13,9 @@ #include #include +#include +#include + namespace cuopt::linear_programming::detail { // from the paper, probability of choosing random candidate= noise parameter @@ -120,7 +123,7 @@ class bounds_repair_t { void compute_damages(problem_t& problem, i_t n_candidates); bool repair_problem(problem_t& problem, problem_t& original_problem, - timer_t timer_, + termination_checker_t& timer_, const raft::handle_t* handle_ptr_); void apply_move(problem_t& problem, problem_t& original_problem, @@ -144,7 +147,7 @@ class bounds_repair_t { i_t h_n_violated_cstr; const raft::handle_t* handle_ptr; std::mt19937 gen; - timer_t timer{0.}; + termination_checker_t timer; std::vector cycle_vector; i_t cycle_write_pos = 0; }; diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu index 8db4d7ae85..5e68482e6f 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu @@ -5,6 +5,7 @@ */ /* clang-format on */ +#include #include #include #include @@ -16,8 +17,12 @@ #include #include #include +#include #include #include +#include + +#include namespace cuopt::linear_programming::detail { @@ -39,7 +44,8 @@ constraint_prop_t::constraint_prop_t(mip_solver_context_t& c ub_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), assignment_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), - rng(cuopt::seed_generator::get_seed(), 0, 0) + rng(cuopt::seed_generator::get_seed(), 0, 0), + max_timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -725,6 +731,10 @@ void constraint_prop_t::update_host_assignment(const solution_tn_variables, sol.handle_ptr->get_stream()); + sol.handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG( + "update_host_assignment: device_hash=0x%x", + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream())); } template @@ -755,7 +765,7 @@ void constraint_prop_t::restore_original_bounds_on_unfixed( template bool constraint_prop_t::run_repair_procedure(problem_t& problem, problem_t& original_problem, - timer_t& timer, + termination_checker_t& timer, const raft::handle_t* handle_ptr) { // select the first probing value @@ -765,9 +775,14 @@ bool constraint_prop_t::run_repair_procedure(problem_t& prob repair_stats.repair_attempts++; f_t repair_start_time = timer.remaining_time(); i_t n_of_repairs_needed_for_feasible = 0; + // TODO: do this better + i_t iter_limit = std::numeric_limits::max(); + if ((this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + iter_limit = 100; + } do { n_of_repairs_needed_for_feasible++; - if (timer.check_time_limit()) { + if (timer.check_time_limit() || iter_limit-- <= 0) { CUOPT_LOG_DEBUG("Time limit is reached in repair loop!"); f_t repair_end_time = timer.remaining_time(); repair_stats.total_time_spent_on_repair += repair_start_time - repair_end_time; @@ -775,8 +790,24 @@ bool constraint_prop_t::run_repair_procedure(problem_t& prob } repair_stats.total_repair_loops++; collapse_crossing_bounds(problem, original_problem, handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "run_repair_procedure pre-repair: loop=%d bounds_hash=0x%x infeas_count=%d timer_rem=%.6f", + n_of_repairs_needed_for_feasible, + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + bounds_update.infeas_constraints_count, + timer.remaining_time()); + } bool bounds_repaired = bounds_repair.repair_problem(problem, original_problem, timer, handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "run_repair_procedure post-repair: loop=%d repaired=%d bounds_hash=0x%x timer_rem=%.6f", + n_of_repairs_needed_for_feasible, + (int)bounds_repaired, + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + timer.remaining_time()); + } if (bounds_repaired) { repair_stats.intermediate_repair_success++; CUOPT_LOG_DEBUG("Bounds repair success, running bounds prop to verify feasibility!"); @@ -841,11 +872,15 @@ bool constraint_prop_t::find_integer( solution_t& sol, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_config) { using crit_t = termination_criterion_t; auto& unset_integer_vars = unset_vars; + CUOPT_DETERMINISM_LOG("find_integer entry: seed=%lld hash=0x%x rem=%.6f", + (long long)cuopt::seed_generator::peek_seed(), + sol.get_hash(), + timer.remaining_time()); std::mt19937 rng(cuopt::seed_generator::get_seed()); lb_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream()); ub_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream()); @@ -871,6 +906,7 @@ bool constraint_prop_t::find_integer( sol.problem_ptr->integer_indices.data(), sol.problem_ptr->n_integer_vars, sol.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG("sol hash 0x%x", sol.get_hash()); } else { find_unset_integer_vars(sol, unset_integer_vars); sort_by_frac(sol, make_span(unset_integer_vars)); @@ -895,16 +931,17 @@ bool constraint_prop_t::find_integer( set_bounds_on_fixed_vars(sol); } - CUOPT_LOG_DEBUG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size()); if (unset_integer_vars.size() == 0) { - CUOPT_LOG_DEBUG("No integer variables provided in the bounds prop rounding"); + CUOPT_DETERMINISM_LOG("No integer variables provided in the bounds prop rounding"); expand_device_copy(orig_sol.assignment, sol.assignment, sol.handle_ptr->get_stream()); cuopt_func_call(orig_sol.test_variable_bounds()); return orig_sol.compute_feasibility(); } // this is needed for the sort inside of the loop bool problem_ii = is_problem_ii(*sol.problem_ptr); - // if the problem is ii, run the bounds prop in the beginning + CUOPT_DETERMINISM_LOG("is problem ii %d", problem_ii); + // if the problem is ii, run the bounds prop in the beginning if (problem_ii) { bool bounds_repaired = bounds_repair.repair_problem(*sol.problem_ptr, *orig_sol.problem_ptr, timer, sol.handle_ptr); @@ -925,11 +962,16 @@ bool constraint_prop_t::find_integer( sort_by_interval_and_frac(sol, make_span(unset_integer_vars), rng); } set_host_bounds(sol); + CUOPT_DETERMINISM_LOG("find_integer pre-loop: seed=%lld hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + sol.get_hash()); size_t set_count = 0; bool timeout_happened = false; i_t n_failed_repair_iterations = 0; while (set_count < unset_integer_vars.size()) { - CUOPT_LOG_TRACE("n_set_vars %d vars to set %lu", set_count, unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("n_set_vars %d vars to set %lu", set_count, unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("unset_integer_vars size %lu", unset_integer_vars.size()); + const size_t set_count_before = set_count; update_host_assignment(sol); if (max_timer.check_time_limit()) { CUOPT_LOG_DEBUG("Second time limit is reached returning nearest rounding!"); @@ -954,7 +996,8 @@ bool constraint_prop_t::find_integer( bounds_prop_interval = 1; } } - i_t n_vars_to_set = recovery_mode ? 1 : bounds_prop_interval; + i_t n_vars_to_set = recovery_mode ? 1 : bounds_prop_interval; + const bool did_sort = n_vars_to_set != 1; // if we are not at the last stage or if we are in recovery mode, don't sort if (n_vars_to_set != 1) { sort_by_implied_slack_consumption( @@ -965,17 +1008,63 @@ bool constraint_prop_t::find_integer( unset_integer_vars.data() + set_count, n_vars_to_set, sol.handle_ptr->get_stream()); + sol.handle_ptr->sync_stream(); auto var_probe_vals = generate_bulk_rounding_vector(sol, orig_sol, host_vars_to_set, probing_config); + if (timer.deterministic) { + const auto& vids = std::get<0>(var_probe_vals); + const auto& fp = std::get<1>(var_probe_vals); + const auto& sp = std::get<2>(var_probe_vals); + std::string probe_str; + for (size_t k = 0; k < std::min(vids.size(), (size_t)8); ++k) { + char buf[128]; + snprintf(buf, sizeof(buf), " (%d,%.4f,%.4f)", vids[k], fp[k], sp[k]); + probe_str += buf; + } + CUOPT_DETERMINISM_LOG( + "find_integer loop: set_count=%zu n_vars_to_set=%d seed=%lld probes=[%s]", + set_count, + n_vars_to_set, + (long long)cuopt::seed_generator::peek_seed(), + probe_str.c_str()); + } probe( sol, orig_sol.problem_ptr, var_probe_vals, &set_count, unset_integer_vars, probing_config); + CUOPT_DETERMINISM_LOG("find_integer post-probe: seed=%lld set_count=%zu hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + set_count, + sol.get_hash()); + [[maybe_unused]] bool repair_attempted = false; + bool bounds_repaired = false; + i_t n_fixed_vars = 0; if (!(n_failed_repair_iterations >= max_n_failed_repair_iterations) && rounding_ii && !timeout_happened) { - timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)}; + // timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)}; + termination_checker_t repair_timer( + context.gpu_heur_loop, max_timer.remaining_time() / 5, max_timer); save_bounds(sol); - // update bounds and run repair procedure + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "find_integer pre-repair: bounds_hash=0x%x assignment_hash=0x%x infeas_count=%d " + "timer_rem=%.6f", + detail::compute_hash(make_span(sol.problem_ptr->variable_bounds), + sol.handle_ptr->get_stream()), + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()), + bounds_update.infeas_constraints_count, + timer.remaining_time()); + } bool bounds_repaired = run_repair_procedure(*sol.problem_ptr, *orig_sol.problem_ptr, repair_timer, sol.handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "find_integer post-repair: repaired=%d bounds_hash=0x%x assignment_hash=0x%x " + "timer_rem=%.6f", + (int)bounds_repaired, + detail::compute_hash(make_span(sol.problem_ptr->variable_bounds), + sol.handle_ptr->get_stream()), + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()), + timer.remaining_time()); + } if (!bounds_repaired) { restore_bounds(sol); n_failed_repair_iterations++; @@ -998,7 +1087,7 @@ bool constraint_prop_t::find_integer( make_span(sol.problem_ptr->variable_bounds), make_span(orig_sol.problem_ptr->variable_bounds), make_span(sol.assignment)}); - i_t n_fixed_vars = (iter - (unset_vars.begin() + set_count)); + n_fixed_vars = (iter - (unset_vars.begin() + set_count)); CUOPT_LOG_TRACE("After repair procedure, number of additional fixed vars %d", n_fixed_vars); set_count += n_fixed_vars; } @@ -1026,7 +1115,7 @@ bool constraint_prop_t::find_integer( // which is the unchanged problem bounds multi_probe.update_host_bounds(sol.handle_ptr, make_span(sol.problem_ptr->variable_bounds)); } - CUOPT_LOG_DEBUG( + CUOPT_DETERMINISM_LOG( "Bounds propagation rounding end: ii constraint count first buffer %d, second buffer %d", multi_probe.infeas_constraints_count_0, multi_probe.infeas_constraints_count_1); @@ -1038,7 +1127,12 @@ bool constraint_prop_t::find_integer( multi_probe.infeas_constraints_count_1 == 0) && !timeout_happened && lp_run_time_after_feasible > 0) { relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_run_time_after_feasible; + lp_settings.time_limit = lp_run_time_after_feasible; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = orig_sol.problem_ptr->tolerances.absolute_tolerance; lp_settings.save_state = false; lp_settings.return_first_feasible = true; @@ -1050,6 +1144,10 @@ bool constraint_prop_t::find_integer( } bool res_feasible = orig_sol.compute_feasibility(); orig_sol.handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG("find_integer exit: seed=%lld feasible=%d hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + (int)res_feasible, + orig_sol.get_hash()); return res_feasible; } @@ -1057,11 +1155,13 @@ template bool constraint_prop_t::apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_config) { raft::common::nvtx::range fun_scope("constraint prop round"); - max_timer = timer_t{max_time_for_bounds_prop}; + + sol.compute_feasibility(); + max_timer = termination_checker_t{context.gpu_heur_loop, max_time_for_bounds_prop, timer}; if (check_brute_force_rounding(sol)) { return true; } recovery_mode = false; rounding_ii = false; @@ -1076,9 +1176,9 @@ bool constraint_prop_t::apply_round( f_t bounds_prop_end_time = max_timer.remaining_time(); repair_stats.total_time_spent_on_bounds_prop += bounds_prop_start_time - bounds_prop_end_time; - CUOPT_LOG_DEBUG( - "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops %lu " - "total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f " + CUOPT_DETERMINISM_LOG( + "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops" + "%lu total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f " "total_time_spent_on_bounds_prop %f", repair_stats.repair_success, repair_stats.repair_attempts, @@ -1229,6 +1329,13 @@ bool constraint_prop_t::probe( } selected_update = 0; if (first_bounds_update_ii) { selected_update = 1; } + CUOPT_DETERMINISM_LOG( + "probe result: infeas_0=%d infeas_1=%d selected_update=%d recovery=%d rounding_ii=%d", + multi_probe.infeas_constraints_count_0, + multi_probe.infeas_constraints_count_1, + selected_update, + (int)recovery_mode, + (int)rounding_ii); // if we are doing single rounding if (probing_config.has_value() && probing_config.value().get().use_balanced_probing) { cuopt_assert(std::get<0>(var_probe_vals).size() == 1, diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh index 2c609228e8..fbdd88c134 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh @@ -43,7 +43,7 @@ struct constraint_prop_t { constraint_prop_t(mip_solver_context_t& context); bool apply_round(solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_config = std::nullopt); void sort_by_implied_slack_consumption(solution_t& sol, @@ -56,7 +56,7 @@ struct constraint_prop_t { bool find_integer(solution_t& sol, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_config = std::nullopt); void find_set_integer_vars(solution_t& sol, rmm::device_uvector& set_vars); @@ -121,7 +121,7 @@ struct constraint_prop_t { const raft::handle_t* handle_ptr); bool run_repair_procedure(problem_t& problem, problem_t& original_problem, - timer_t& timer, + termination_checker_t& timer, const raft::handle_t* handle_ptr); bool handle_fixed_vars( solution_t& sol, @@ -149,7 +149,7 @@ struct constraint_prop_t { i_t bounds_prop_interval = 1; i_t n_iter_in_recovery = 0; i_t max_n_failed_repair_iterations = 1; - timer_t max_timer{0.}; + termination_checker_t max_timer; bool use_probing_cache = true; static repair_stats_t repair_stats; bool single_rounding_only = false; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu index 7d074aea5e..65d0b46993 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu @@ -8,8 +8,10 @@ #include "lb_bounds_repair.cuh" #include +#include #include #include +#include #include #include #include @@ -26,7 +28,8 @@ lb_bounds_repair_t::lb_bounds_repair_t(const raft::handle_t* handle_pt violated_cstr_map(0, handle_ptr->get_stream()), total_vio(handle_ptr->get_stream()), gen(cuopt::seed_generator::get_seed()), - cycle_vector(MAX_CYCLE_SEQUENCE, -1) + cycle_vector(MAX_CYCLE_SEQUENCE, -1), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -68,8 +71,7 @@ std::tuple lb_bounds_repair_t::get_ii_violation( constraint_upper_bounds = problem.constraint_upper_bounds, cnst_slack = make_span_2(lb_bound_presolve.cnst_slack), cstr_violations_up = cstr_violations_up.data(), - cstr_violations_down = cstr_violations_down.data(), - total_vio = total_vio.data()] __device__(i_t cstr_idx) { + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) { f_t cnst_lb = constraint_lower_bounds[cstr_idx]; f_t cnst_ub = constraint_upper_bounds[cstr_idx]; f_t2 slack = cnst_slack[cstr_idx]; @@ -80,7 +82,6 @@ std::tuple lb_bounds_repair_t::get_ii_violation( f_t violation = max(curr_cstr_violation_up, curr_cstr_violation_down); if (violation >= ROUNDOFF_TOLERANCE) { violated_cstr_map[cstr_idx] = 1; - atomicAdd(total_vio, violation); } else { violated_cstr_map[cstr_idx] = 0; } @@ -94,7 +95,18 @@ std::tuple lb_bounds_repair_t::get_ii_violation( violated_constraints.data(), cuda::std::identity{}); i_t n_violated_cstr = iter - violated_constraints.data(); - f_t total_violation = total_vio.value(handle_ptr->get_stream()); + // Use deterministic reduction instead of non-deterministic atomicAdd + f_t total_violation = thrust::transform_reduce( + handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + problem.n_constraints, + [cstr_violations_up = cstr_violations_up.data(), + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t { + auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]); + return violation >= ROUNDOFF_TOLERANCE ? violation : 0.; + }, + (f_t)0, + thrust::plus()); CUOPT_LOG_TRACE( "Repair: n_violated_cstr %d total_violation %f", n_violated_cstr, total_violation); return std::make_tuple(total_violation, n_violated_cstr); @@ -397,10 +409,11 @@ bool lb_bounds_repair_t::repair_problem( load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bound_presolve, problem_t& original_problem, - timer_t timer_, + termination_checker_t& timer_, const raft::handle_t* handle_ptr_) { - CUOPT_LOG_DEBUG("Running bounds repair"); + nvtx::range fun_scope("LB repair_problem"); + CUOPT_LOG_DEBUG("LB Running bounds repair"); handle_ptr = handle_ptr_; timer = timer_; resize(*problem); diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh index 0b549c684d..59368cacd8 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh @@ -58,7 +58,7 @@ class lb_bounds_repair_t { bool repair_problem(load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bound_presolve, problem_t& original_problem, - timer_t timer_, + termination_checker_t& timer_, const raft::handle_t* handle_ptr_); void apply_move(load_balanced_problem_t* problem, problem_t& original_problem, @@ -82,7 +82,7 @@ class lb_bounds_repair_t { i_t h_n_violated_cstr; const raft::handle_t* handle_ptr; std::mt19937 gen; - timer_t timer{0.}; + termination_checker_t timer; std::vector cycle_vector; i_t cycle_write_pos = 0; }; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu index bb72834ab4..c93c379eed 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu @@ -33,7 +33,8 @@ lb_constraint_prop_t::lb_constraint_prop_t(mip_solver_context_thandle_ptr->get_stream()), assignment_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), - rng(cuopt::seed_generator::get_seed(), 0, 0) + rng(cuopt::seed_generator::get_seed(), 0, 0), + max_timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -700,14 +701,15 @@ template bool lb_constraint_prop_t::apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_candidates) { raft::common::nvtx::range fun_scope("constraint prop round"); // this is second timer that can continue but without recovery mode const f_t max_time_for_bounds_prop = 5.; - max_timer = timer_t{max_time_for_bounds_prop}; + max_timer = + termination_checker_t{context.gpu_heur_loop, max_time_for_bounds_prop, *context.termination}; if (check_brute_force_rounding(sol)) { return true; } recovery_mode = false; rounding_ii = false; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh index 20e28e7cb9..d0a763f610 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh @@ -23,7 +23,7 @@ struct lb_constraint_prop_t { bool apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_candidates = std::nullopt); void sort_by_implied_slack_consumption( problem_t& original_problem, @@ -40,7 +40,7 @@ struct lb_constraint_prop_t { load_balanced_bounds_presolve_t& lb_bounds_update, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + termination_checker_t& timer, std::optional>> probing_candidates); std::tuple probing_values( load_balanced_bounds_presolve_t& lb_bounds_update, @@ -83,7 +83,7 @@ struct lb_constraint_prop_t { bool run_repair_procedure(load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bounds_update, problem_t& original_problem, - timer_t& timer, + termination_checker_t& timer, const raft::handle_t* handle_ptr); mip_solver_context_t& context; @@ -100,7 +100,7 @@ struct lb_constraint_prop_t { bool rounding_ii = false; i_t bounds_prop_interval = 1; i_t n_iter_in_recovery = 0; - timer_t max_timer{0.}; + termination_checker_t max_timer; bool use_probing_cache = true; size_t repair_attempts = 0; diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu index 4f3a015a6c..9a2bf317b7 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu @@ -8,8 +8,10 @@ #include "simple_rounding.cuh" #include "simple_rounding_kernels.cuh" +#include #include #include +#include #include #include @@ -35,6 +37,8 @@ bool check_brute_force_rounding(solution_t& solution) if (n_integers_to_round == 0) { return solution.compute_feasibility(); } constexpr i_t brute_force_rounding_threshold = 8; if (n_integers_to_round <= brute_force_rounding_threshold) { + CUOPT_DETERMINISM_LOG( + "Brute-force rounding: n_to_round=%d hash=0x%x", n_integers_to_round, solution.get_hash()); solution.compute_constraints(); i_t n_configs = pow(2, n_integers_to_round); i_t n_blocks = (n_configs + TPB - 1) / TPB; @@ -42,7 +46,8 @@ bool check_brute_force_rounding(solution_t& solution) rmm::device_uvector var_map(n_integers_to_round, solution.handle_ptr->get_stream()); rmm::device_uvector constraint_buf(n_configs * solution.problem_ptr->n_constraints, solution.handle_ptr->get_stream()); - rmm::device_scalar best_config(-1, solution.handle_ptr->get_stream()); + rmm::device_scalar best_config(std::numeric_limits::max(), + solution.handle_ptr->get_stream()); thrust::copy_if( solution.handle_ptr->get_thrust_policy(), solution.problem_ptr->integer_indices.begin(), @@ -58,7 +63,13 @@ bool check_brute_force_rounding(solution_t& solution) cuopt::make_span(var_map), cuopt::make_span(constraint_buf), best_config.data()); - if (best_config.value(solution.handle_ptr->get_stream()) != -1) { + i_t best_config_val = best_config.value(solution.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "Brute-force rounding: best_config=%d (max=%d) var_map_hash=0x%x", + best_config_val, + (int)std::numeric_limits::max(), + detail::compute_hash(make_span(var_map), solution.handle_ptr->get_stream())); + if (best_config_val != std::numeric_limits::max()) { CUOPT_LOG_DEBUG("Feasible found during brute force rounding!"); // apply the feasible rounding apply_feasible_rounding_kernel<<<1, TPB, 0, solution.handle_ptr->get_stream()>>>( diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh index 2edca8fb08..a0b8468ea7 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh @@ -131,7 +131,7 @@ __global__ void brute_force_check_kernel(typename solution_t::view_t s __shared__ i_t shbuf[raft::WarpSize]; i_t total_feasible = raft::blockReduce(th_feasible_count, (char*)shbuf); if (threadIdx.x == 0) { - if (total_feasible == solution.problem.n_constraints) { atomicExch(best_config, config); } + if (total_feasible == solution.problem.n_constraints) { atomicMin(best_config, config); } } } diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp index 47d3d22de4..94b511da60 100644 --- a/cpp/src/mip_heuristics/mip_constants.hpp +++ b/cpp/src/mip_heuristics/mip_constants.hpp @@ -13,3 +13,7 @@ #define MIP_INSTANTIATE_DOUBLE CUOPT_INSTANTIATE_DOUBLE #define PDLP_INSTANTIATE_FLOAT 1 + +#define BB_BASE_WORK_SCALE 1.0 +#define GPU_HEUR_BASE_WORK_SCALE 0.4 +#define CPUFJ_BASE_WORK_SCALE 1.0 diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu index d78f8beb16..812b23956e 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -167,10 +168,15 @@ void bound_presolve_t::set_bounds( template termination_criterion_t bound_presolve_t::bound_update_loop(problem_t& pb, - timer_t timer) + termination_checker_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; + // TODO: proper work units + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + settings.iteration_limit = std::min(settings.iteration_limit, 50); + } + i_t iter; upd.init_changed_constraints(pb.handle_ptr); for (iter = 0; iter < settings.iteration_limit; ++iter) { @@ -229,7 +235,7 @@ termination_criterion_t bound_presolve_t::solve(problem_t& p i_t var_idx) { auto& handle_ptr = pb.handle_ptr; - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); copy_input_bounds(pb); upd.lb.set_element_async(var_idx, var_lb, handle_ptr->get_stream()); upd.ub.set_element_async(var_idx, var_ub, handle_ptr->get_stream()); @@ -242,7 +248,7 @@ termination_criterion_t bound_presolve_t::solve( const std::vector>& var_probe_val_pairs, bool use_host_bounds) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); @@ -257,7 +263,7 @@ termination_criterion_t bound_presolve_t::solve( template termination_criterion_t bound_presolve_t::solve(problem_t& pb) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; copy_input_bounds(pb); return bound_update_loop(pb, timer); diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh index 8b57cc7019..3c38460d92 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -60,7 +61,7 @@ class bound_presolve_t { void set_updated_bounds(const raft::handle_t* handle_ptr, raft::device_span output_lb, raft::device_span output_ub); - termination_criterion_t bound_update_loop(problem_t& pb, timer_t timer); + termination_criterion_t bound_update_loop(problem_t& pb, termination_checker_t& timer); void set_bounds(raft::device_span var_lb, raft::device_span var_ub, const std::vector>& var_probe_vals, diff --git a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu index 487549aa4a..29eab4e69c 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu +++ b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu @@ -35,6 +35,35 @@ void bounds_update_data_t::resize(problem_t& problem) changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream()); next_changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream()); changed_variables.resize(problem.n_variables, problem.handle_ptr->get_stream()); + + thrust::fill(problem.handle_ptr->get_thrust_policy(), + min_activity.begin(), + min_activity.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + max_activity.begin(), + max_activity.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + lb.begin(), + lb.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + ub.begin(), + ub.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + changed_constraints.begin(), + changed_constraints.end(), + -1); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + next_changed_constraints.begin(), + next_changed_constraints.end(), + -1); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + changed_variables.begin(), + changed_variables.end(), + -1); } template diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 13412614b8..24cac7129f 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -17,6 +17,12 @@ #include "cusparse.h" #include + +#include +#include +#include +#include + #include "conditional_bound_strengthening.cuh" #include diff --git a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu index 3a6d1bce21..caf401cc52 100644 --- a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include @@ -309,7 +311,7 @@ inline std::vector compute_prioritized_integer_indices( template void compute_probing_cache(load_balanced_bounds_presolve_t& bound_presolve, load_balanced_problem_t& problem, - timer_t timer) + termination_checker_t& timer) { // we dont want to compute the probing cache for all variables for time and computation resources auto priority_indices = compute_prioritized_integer_indices(bound_presolve, problem); @@ -400,7 +402,7 @@ void compute_probing_cache(load_balanced_bounds_presolve_t& bound_pres template void compute_probing_cache( \ load_balanced_bounds_presolve_t & bound_presolve, \ load_balanced_problem_t & problem, \ - timer_t timer); \ + termination_checker_t & timer); \ template class lb_probing_cache_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu index 0d16c26cae..88c7c57b51 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -526,7 +527,7 @@ bool load_balanced_bounds_presolve_t::update_bounds_from_slack( template termination_criterion_t load_balanced_bounds_presolve_t::bound_update_loop( - const raft::handle_t* handle_ptr, timer_t timer) + const raft::handle_t* handle_ptr, termination_checker_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; @@ -626,7 +627,7 @@ termination_criterion_t load_balanced_bounds_presolve_t::solve(f_t var f_t var_ub, i_t var_idx) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; copy_input_bounds(*pb); vars_bnd.set_element_async(2 * var_idx, var_lb, handle_ptr->get_stream()); @@ -638,7 +639,7 @@ template termination_criterion_t load_balanced_bounds_presolve_t::solve( raft::device_span input_bounds) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; if (input_bounds.size() != 0) { raft::copy(vars_bnd.data(), input_bounds.data(), input_bounds.size(), handle_ptr->get_stream()); @@ -667,7 +668,7 @@ template termination_criterion_t load_balanced_bounds_presolve_t::solve( const std::vector>& var_probe_val_pairs, bool use_host_bounds) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh index ff085ca962..488877d7fb 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh @@ -15,6 +15,7 @@ #include #include +#include #include #include "load_balanced_partition_helpers.cuh" @@ -159,7 +160,8 @@ class load_balanced_bounds_presolve_t { void calculate_constraint_slack_iter(const raft::handle_t* handle_ptr); bool update_bounds_from_slack(const raft::handle_t* handle_ptr); - termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr, timer_t timer); + termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr, + termination_checker_t& timer); bool calculate_infeasible_redundant_constraints(const raft::handle_t* handle_ptr); // void calculate_constraint_slack_on_problem_bounds(); diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh index cbcd91a7d7..f276840bdf 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -10,6 +10,7 @@ #include "load_balanced_bounds_presolve_kernels.cuh" #include "load_balanced_partition_helpers.cuh" +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cu b/cpp/src/mip_heuristics/presolve/multi_probe.cu index 7789b3281b..5321c7ff57 100644 --- a/cpp/src/mip_heuristics/presolve/multi_probe.cu +++ b/cpp/src/mip_heuristics/presolve/multi_probe.cu @@ -5,10 +5,13 @@ */ /* clang-format on */ +#include #include +#include #include #include +#include #include #include #include @@ -19,6 +22,15 @@ #include "bounds_update_helpers.cuh" #include "multi_probe.cuh" +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { // Tobias Achterberg, Robert E. Bixby, Zonghao Gu, Edward Rothberg, Dieter Weninger (2019) Presolve @@ -263,7 +275,7 @@ void multi_probe_t::set_bounds( template termination_criterion_t multi_probe_t::bound_update_loop(problem_t& pb, const raft::handle_t* handle_ptr, - timer_t timer) + termination_checker_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; skip_0 = false; @@ -279,7 +291,13 @@ termination_criterion_t multi_probe_t::bound_update_loop(problem_t::bound_update_loop(problem_tget_stream()), + detail::compute_hash(make_span(upd_0.ub), handle_ptr->get_stream()), + detail::compute_hash(make_span(upd_1.lb), handle_ptr->get_stream()), + detail::compute_hash(make_span(upd_1.ub), handle_ptr->get_stream()), + timer.remaining_time()); + return criteria; } @@ -343,6 +374,10 @@ void multi_probe_t::update_host_bounds( [] __device__(auto i) { return thrust::make_tuple(get_lower(i), get_upper(i)); }); raft::copy(host_lb.data(), var_lb.data(), var_lb.size(), handle_ptr->get_stream()); raft::copy(host_ub.data(), var_ub.data(), var_ub.size(), handle_ptr->get_stream()); + handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG("update_host_bounds: lb_hash=0x%x ub_hash=0x%x", + detail::compute_hash(make_span(var_lb), handle_ptr->get_stream()), + detail::compute_hash(make_span(var_ub), handle_ptr->get_stream())); } template @@ -375,7 +410,7 @@ termination_criterion_t multi_probe_t::solve_for_interval( const std::tuple, std::pair>& var_interval_vals, const raft::handle_t* handle_ptr) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); copy_problem_into_probing_buffers(pb, handle_ptr); set_interval_bounds(var_interval_vals, pb, handle_ptr); @@ -389,7 +424,7 @@ termination_criterion_t multi_probe_t::solve( const std::tuple, std::vector, std::vector>& var_probe_vals, bool use_host_bounds) { - timer_t timer(settings.time_limit); + termination_checker_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cuh b/cpp/src/mip_heuristics/presolve/multi_probe.cuh index a043770789..0c182f0fcb 100644 --- a/cpp/src/mip_heuristics/presolve/multi_probe.cuh +++ b/cpp/src/mip_heuristics/presolve/multi_probe.cuh @@ -12,6 +12,7 @@ #include #include +#include #include #include "bounds_update_data.cuh" @@ -54,7 +55,7 @@ class multi_probe_t { i_t select_update); termination_criterion_t bound_update_loop(problem_t& pb, const raft::handle_t* handle_ptr, - timer_t timer); + termination_checker_t& timer); void set_interval_bounds( const std::tuple, std::pair>& var_interval_vals, problem_t& pb, diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 4f5e16ddb9..a0adbbd2eb 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -14,7 +14,10 @@ #include #include +#include +#include #include +#include #include #include @@ -367,7 +370,7 @@ void compute_cache_for_var(i_t var_idx, std::atomic& problem_is_infeasible, std::vector>& modification_vector, std::vector>& substitution_vector, - timer_t timer, + const termination_checker_t& timer, i_t device_id) { RAFT_CUDA_TRY(cudaSetDevice(device_id)); @@ -704,8 +707,11 @@ void apply_substitution_queue_to_problem( host_copy(problem.presolve_data.variable_mapping, problem.handle_ptr->get_stream()); problem.handle_ptr->sync_stream(); + // remove duplicate substitution proposals to avoid races later + std::unordered_set seen_substituted; for (const auto& [substituting_var, substitutions] : all_substitutions) { for (const auto& [substituted_var, substitution] : substitutions) { + if (!seen_substituted.insert(substitution.substituted_var).second) { continue; } CUOPT_LOG_TRACE("Applying substitution: %d -> %d", substitution.substituting_var, substitution.substituted_var); @@ -843,7 +849,7 @@ std::vector compute_priority_indices_by_implied_integers(problem_t bool compute_probing_cache(bound_presolve_t& bound_presolve, problem_t& problem, - timer_t timer) + termination_checker_t& timer) { raft::common::nvtx::range fun_scope("compute_probing_cache"); // we dont want to compute the probing cache for all variables for time and computation resources @@ -857,6 +863,12 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, bound_presolve.settings.iteration_limit = 50; bound_presolve.settings.time_limit = timer.remaining_time(); + // TODO: proper work unit accounting in deterministic mode for the probing cache + if ((bound_presolve.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + bound_presolve.settings.iteration_limit = 1; + priority_indices.resize(std::min(priority_indices.size(), 2048)); + } + size_t num_threads = bound_presolve.settings.num_threads < 0 ? 0.2 * omp_get_max_threads() : bound_presolve.settings.num_threads; @@ -949,7 +961,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, #define INSTANTIATE(F_TYPE) \ template bool compute_probing_cache(bound_presolve_t & bound_presolve, \ problem_t & problem, \ - timer_t timer); \ + termination_checker_t & timer); \ template class probing_cache_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cuh b/cpp/src/mip_heuristics/presolve/probing_cache.cuh index 91da6a15c8..26ea2991fc 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cuh +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cuh @@ -11,6 +11,7 @@ #include +#include #include namespace cuopt::linear_programming::detail { @@ -119,6 +120,6 @@ class lb_probing_cache_t { template bool compute_probing_cache(bound_presolve_t& bound_presolve, problem_t& problem, - timer_t timer); + termination_checker_t& timer); } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index d94cf5aa67..d36cab5173 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -541,10 +541,34 @@ void check_postsolve_status(const papilo::PostsolveStatus& status) } } +// Wrapper to run papilo presolvers in TBB sequential mode +// This is necessary due to a bug in commit <> in Probing +// that causes nondeterminism. Disable it when running in deterministic mode. +template