Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/coverity.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
revision: latest_available_commit
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
revision: latest_available_commit
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ jobs:
platform: macos_14_7
arch: 'arm64'
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
revision: latest_available_commit
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/manylinux_2_28.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ jobs:
with:
platform: almalinux8
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
revision: latest_available_commit
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request

- name: Clone docker tag from OpenVINO repo
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
with:
platform: windows
commit_packages_to_provide: wheels,openvino_node_npm_package.zip
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
revision: latest_available_commit
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,6 @@ namespace genai {

namespace utils {

/** Applies transformations to the ov::Model to enable paged attention inference.
* @param model Pointer to the ov::Model representing one of the supported LLM architectures.
* @param device_config Configuration struct for inferencing device specifics.
* @param per_layer_cache_control If true, then the transformations will enable per-layer control of KV cache blocks, allowing
* different sets of KV cache blocks to be specified for different attention layers. If false, then the KV cache block structure will be identical across all
* decoder layers.
* @param allow_cache_rotation If true, then the transformations will enable additional per-layer inputs to perform re-rotation of specific
* blocks (in a RoPE fashion) before the inference step.
* @param allow_xattention If true, then the transformations will enable additional per-layer inputs to control the XAttention block-sparse
* attention optimization.
*/
void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false, bool allow_cache_rotation = false, bool allow_xattention = false);

void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);

} // namespace utils
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/src/continuous_batching/pipeline_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#endif

#include "openvino/genai/text_streamer.hpp"
#include "openvino/pass/sdpa_to_paged_attention.hpp"
#include "continuous_batching/pipeline_impl.hpp"
#include "utils.hpp"
#include "continuous_batching/paged_attention_transformations.hpp"
Expand Down Expand Up @@ -76,7 +77,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation;
bool allow_xattention = scheduler_config.use_sparse_attention && scheduler_config.sparse_attention_config.mode == SparseAttentionMode::XATTENTION;
utils::apply_paged_attention_transformations(model, is_need_per_layer_cache_control, allow_cache_rotation, allow_xattention);
bool allow_score_aggregation = true;
ov::pass::SDPAToPagedAttention(is_need_per_layer_cache_control, is_need_per_layer_cache_control, allow_score_aggregation, allow_cache_rotation, allow_xattention).run_on_model(model);
utils::apply_gather_before_matmul_transformation(model);

initialize_pipeline(model, scheduler_config, device, properties);
Expand Down
15 changes: 12 additions & 3 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <thread>

#include "openvino/genai/text_streamer.hpp"
#include "openvino/pass/sdpa_to_paged_attention.hpp"
#include "speculative_decoding_impl.hpp"
#include "continuous_batching/paged_attention_transformations.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -35,9 +36,17 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con

auto main_scheduler_config = main_model_desc.scheduler_config;
auto main_device = main_model_desc.device;

utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
bool allow_score_aggregation = true;
bool allow_xattention = false;

ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
main_model_desc.scheduler_config.use_cache_eviction,
allow_score_aggregation,
allow_xattention).run_on_model(main_model);
ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
main_model_desc.scheduler_config.use_cache_eviction,
allow_score_aggregation,
allow_xattention).run_on_model(draft_model);

utils::apply_gather_before_matmul_transformation(main_model);
utils::apply_gather_before_matmul_transformation(draft_model);
Expand Down
Loading