Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .github/workflows/coverity.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
revision: latest_available_commit

- name: Clone docker tag from OpenVINO repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
revision: latest_available_commit

- name: Clone docker tag from OpenVINO repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,7 @@ jobs:
platform: macos_14_7
arch: 'arm64'
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
revision: latest_available_commit

genai_build_cmake:
name: Build cpack - ${{ matrix.build-type }}
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,7 @@ jobs:
with:
platform: windows
commit_packages_to_provide: wheels,openvino_node_npm_package.zip
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
# Set specific revision and uncomment to use OV from its PR build:
# branch_name: master
# event_name: pull_request
revision: latest_available_commit

genai_build_cpack:
name: genai cpack (${{ matrix.build-type }})
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,6 @@ namespace genai {

namespace utils {

/** Applies transformations to the ov::Model to enable paged attention inference.
* @param model Pointer to the ov::Model representing one of the supported LLM architectures.
* @param per_layer_cache_control If true, the transformations enable per-layer control of KV cache blocks, making it possible to specify
* different sets of KV cache blocks for different attention layers. If false, the KV cache block structure is identical across all
* decoder layers. Defaults to false.
* @param allow_cache_rotation If true, the transformations enable additional per-layer inputs used to re-rotate specific
* blocks (in a RoPE fashion) before the inference step. Defaults to false.
* @param allow_xattention If true, the transformations enable additional per-layer inputs that control the XAttention block-sparse
* attention optimization. Defaults to false.
*/
void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false, bool allow_cache_rotation = false, bool allow_xattention = false);

/** Runs the gather-before-matmul transformation on the given model.
* NOTE(review): the definition is not visible here; presumably the model is modified in place, since nothing is returned - confirm
* against the implementation.
* @param model Pointer to the ov::Model to transform.
*/
void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);

} // namespace utils
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/src/continuous_batching/pipeline_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#endif

#include "openvino/genai/text_streamer.hpp"
#include "openvino/pass/sdpa_to_paged_attention.hpp"
#include "continuous_batching/pipeline_impl.hpp"
#include "utils.hpp"
#include "continuous_batching/paged_attention_transformations.hpp"
Expand Down Expand Up @@ -76,7 +77,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation;
bool allow_xattention = scheduler_config.use_sparse_attention && scheduler_config.sparse_attention_config.mode == SparseAttentionMode::XATTENTION;
utils::apply_paged_attention_transformations(model, is_need_per_layer_cache_control, allow_cache_rotation, allow_xattention);
bool allow_score_aggregation = true;
ov::pass::SDPAToPagedAttention(is_need_per_layer_cache_control, is_need_per_layer_cache_control, allow_score_aggregation, allow_cache_rotation, allow_xattention).run_on_model(model);
utils::apply_gather_before_matmul_transformation(model);

initialize_pipeline(model, scheduler_config, device, properties);
Expand Down
15 changes: 12 additions & 3 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <thread>

#include "openvino/genai/text_streamer.hpp"
#include "openvino/pass/sdpa_to_paged_attention.hpp"
#include "speculative_decoding_impl.hpp"
#include "continuous_batching/paged_attention_transformations.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -35,9 +36,17 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con

auto main_scheduler_config = main_model_desc.scheduler_config;
auto main_device = main_model_desc.device;

utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
bool allow_score_aggregation = true;
bool allow_xattention = false;

ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
main_model_desc.scheduler_config.use_cache_eviction,
allow_score_aggregation,
allow_xattention).run_on_model(main_model);
ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
main_model_desc.scheduler_config.use_cache_eviction,
allow_score_aggregation,
allow_xattention).run_on_model(draft_model);

utils::apply_gather_before_matmul_transformation(main_model);
utils::apply_gather_before_matmul_transformation(draft_model);
Expand Down
Loading