@@ -31,83 +31,6 @@ func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi

// -----

-#map = affine_map<() -> ()>
-func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = []
-  } outs(%init : tensor<f32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<f32>
-
-  return %res : tensor<f32>
-}
-
-// CHECK-LABEL: func.func @extract_scalar_from_0d_into_0d(
-// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
-// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK: vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
-
-// -----
-
-#map = affine_map<(n) -> (n)>
-func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel"]
-  } outs(%init : tensor<1xf32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<1xf32>
-
-  return %res : tensor<1xf32>
-}
-// CHECK-LABEL: func.func @extract_scalar_from_0d_into_1d(
-// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
-// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %c0 = arith.constant 1 : index
-  %c1 = arith.constant 2 : index
-
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%init : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
-    linalg.yield %1 : f32
-  } -> tensor<1x1x3xf32>
-
-  return %res : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_scalar_broadcast(
-// CHECK-SAME: %[[SRC:.*]]: tensor<3x3xf32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
-// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
-// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
-
-// -----
-
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_transfer_read_basic(
    %arg0: tensor<3x3x3xf32>,
@@ -144,37 +67,6 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic(
// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[IDX3]]], %[[CST]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
// CHECK: vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

-// Same as example above, but reading into a column tensor.
-
-// TODO: Currently this fails to vectorise when the indices are non-constant.
-
-func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-    %input: tensor<3x3x3xf32>,
-    %output: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
-
-  %c0 = arith.constant 0 : index
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%output : tensor<3x1x1xf32>) {
-  ^bb0(%out: f32):
-    %5 = tensor.extract %input[%c0, %c0, %c0] : tensor<3x3x3xf32>
-    linalg.yield %5 : f32
-  } -> tensor<3x1x1xf32>
-
-  return %res : tensor<3x1x1xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-// CHECK-SAME: %[[INPUT:.*]]: tensor<3x3x3xf32>,
-// CHECK-SAME: %[[OUTPUT:.*]]: tensor<3x1x1xf32>)
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
-// CHECK: %[[RES:.*]] = vector.transfer_write %[[BCAST]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>
-// CHECK: return %[[RES]] : tensor<3x1x1xf32>
-
// -----

func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
@@ -620,26 +512,6 @@ func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1:

// -----

-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_0d_tensor_extract(%arg0: tensor<f32>, %arg2: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %2 = linalg.generic {
-    indexing_maps = [#map1],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%arg2 : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %7 = tensor.extract %arg0[] : tensor<f32>
-    linalg.yield %7 : f32
-  } -> tensor<1x1x3xf32>
-  return %2 : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
-// CHECK-SAME: %[[ARG_0:.*]]: tensor<f32>
-// CHECK: %[[EXTRACT:.*]] = vector.transfer_read %[[ARG_0]][], %{{.+}} : tensor<f32>
-// CHECK: vector.broadcast %[[EXTRACT]] : vector<f32> to vector<1x1x3xf32>
-
-// -----
-
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
@@ -674,17 +546,118 @@ func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
// CHECK: vector.transfer_write %[[GATHER]]

+//===----------------------------------------------------------------------===//
+// Scalar load + broadcast
+//===----------------------------------------------------------------------===//
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %c0 = arith.constant 1 : index
+  %c1 = arith.constant 2 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_scalar_broadcast(
+// CHECK-SAME: %[[SRC:.*]]: tensor<3x3xf32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
+
// -----

-func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
+#map = affine_map<() -> ()>
+func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = []
+  } outs(%init : tensor<f32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<f32>
+
+  return %res : tensor<f32>
+}
+
+// CHECK-LABEL: func.func @extract_scalar_from_0d_into_0d(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK: vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
+
+// -----
+
+#map = affine_map<(n) -> (n)>
+func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel"]
+  } outs(%init : tensor<1xf32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1xf32>
+
+  return %res : tensor<1xf32>
+}
+// CHECK-LABEL: func.func @extract_scalar_from_0d_into_1d(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_0d_tensor_extract(%src: tensor<f32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map1],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %{{.+}} : tensor<f32>
+// CHECK: vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+
+// -----
+
+func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>

  %res = linalg.generic {
    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-    iterator_types = ["parallel", "parallel", "parallel"]}
-    outs(%init : tensor<1x1x4xi32>) {
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x4xi32>) {

  ^bb0(%out: i32):
    %idx = linalg.index 0 : index
@@ -695,13 +668,45 @@ func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor
  return %res : tensor<1x1x4xi32>
}

-// CHECK-LABEL: func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(
+// CHECK-LABEL: func.func @scalar_read_with_broadcast_from_column_tensor
// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
-// CHECK: %[[PAD:.*]] = arith.constant 0 : i32
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
-// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
+// CHECK-DAG: %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
// CHECK: %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
+
+// -----
+
+// TODO: Currently this fails to vectorise when the indices are non-constant.
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+    %src: tensor<3x3x3xf32>,
+    %init: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
+
+  %c0 = arith.constant 0 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<3x1x1xf32>) {
+  ^bb0(%out: f32):
+    %1 = tensor.extract %src[%c0, %c0, %c0] : tensor<3x3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<3x1x1xf32>
+
+  return %res : tensor<3x1x1xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+// CHECK-SAME: %[[SRC:.*]]: tensor<3x3x3xf32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<3x1x1xf32>)
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>