// tnc/contractionpath/communication_schemes.rs

1use std::fmt;
2
3use itertools::Itertools;
4use rand::distr::Uniform;
5use rand::Rng;
6use rustc_hash::FxHashMap;
7
8use crate::contractionpath::contraction_cost::communication_path_cost;
9use crate::contractionpath::paths::cotengrust::{Cotengrust, OptMethod};
10use crate::contractionpath::paths::weighted_branchbound::WeightedBranchBound;
11use crate::contractionpath::paths::{CostType, FindPath};
12use crate::contractionpath::SimplePath;
13use crate::tensornetwork::partitioning::{communication_partitioning, PartitioningStrategy};
14use crate::tensornetwork::tensor::Tensor;
15
/// The scheme used to find a contraction path for the final fan-in of tensors
/// between MPI ranks.
#[derive(Debug, Copy, Clone)]
pub enum CommunicationScheme {
    /// Uses Greedy scheme to find contraction path for communication
    Greedy,
    /// Uses a randomized greedy approach
    RandomGreedy,
    /// Uses repeated bipartitioning to identify communication path
    Bipartition,
    /// Uses repeated bipartitioning over several randomly sampled imbalance
    /// factors, keeping the cheapest path found
    BipartitionSweep,
    /// Uses a filtered search that considers time to intermediate tensor
    WeightedBranchBound,
    /// Uses a filtered search (same search as `WeightedBranchBound`, but with
    /// all per-tensor latencies set to zero)
    BranchBound,
}
33
34impl fmt::Display for CommunicationScheme {
35    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
36        let comm_str = match self {
37            Self::Greedy => "greedy",
38            Self::RandomGreedy => "random_greedy",
39            Self::Bipartition => "bipartition",
40            Self::BipartitionSweep => "bipartition_sweep",
41            Self::WeightedBranchBound => "weightedbranchbound",
42            Self::BranchBound => "branchbound",
43        };
44        f.write_str(comm_str)
45    }
46}
47
48impl CommunicationScheme {
49    pub(crate) fn communication_path<R>(
50        &self,
51        children_tensors: &[Tensor],
52        latency_map: &FxHashMap<usize, f64>,
53        rng: Option<&mut R>,
54    ) -> SimplePath
55    where
56        R: Rng,
57    {
58        match self {
59            Self::Greedy => greedy(children_tensors, latency_map),
60            Self::RandomGreedy => random_greedy(children_tensors),
61            Self::Bipartition => bipartition(children_tensors, latency_map),
62            Self::BipartitionSweep => {
63                let Some(rng) = rng else {
64                    panic!("BipartitionSweep requires a random number generator")
65                };
66                bipartition_sweep(children_tensors, latency_map, rng)
67            }
68
69            Self::WeightedBranchBound => weighted_branchbound(children_tensors, latency_map),
70            Self::BranchBound => branchbound(children_tensors),
71        }
72    }
73}
74
75fn greedy(children_tensors: &[Tensor], _latency_map: &FxHashMap<usize, f64>) -> SimplePath {
76    let communication_tensors = Tensor::new_composite(children_tensors.to_vec());
77    let mut opt = Cotengrust::new(&communication_tensors, OptMethod::Greedy);
78    opt.find_path();
79    opt.get_best_replace_path().into_simple()
80}
81
82fn bipartition(children_tensors: &[Tensor], _latency_map: &FxHashMap<usize, f64>) -> SimplePath {
83    let children_tensors = children_tensors.iter().cloned().enumerate().collect_vec();
84    let imbalance = 0.03;
85    tensor_bipartition(&children_tensors, imbalance)
86}
87
88fn bipartition_sweep<R>(
89    children_tensors: &[Tensor],
90    latency_map: &FxHashMap<usize, f64>,
91    rng: &mut R,
92) -> SimplePath
93where
94    R: Rng,
95{
96    let tensors = children_tensors.iter().cloned().enumerate().collect_vec();
97    let mut best_flops = f64::INFINITY;
98    let mut best_path = vec![];
99    let partition_latencies = latency_map
100        .iter()
101        .sorted_by_key(|(k, _)| **k)
102        .map(|(_, v)| *v)
103        .collect::<Vec<_>>();
104    for _ in 0..20 {
105        let imbalance = rng.sample(Uniform::new(0.01, 0.5).unwrap());
106        let path = tensor_bipartition(&tensors, imbalance);
107        let (flops, _) = communication_path_cost(
108            children_tensors,
109            &path,
110            true,
111            true,
112            Some(&partition_latencies),
113        );
114        if flops < best_flops {
115            best_flops = flops;
116            best_path = path;
117        }
118    }
119    best_path
120}
121
122fn weighted_branchbound(
123    children_tensors: &[Tensor],
124    latency_map: &FxHashMap<usize, f64>,
125) -> SimplePath {
126    let communication_tensors = Tensor::new_composite(children_tensors.to_vec());
127
128    let mut opt = WeightedBranchBound::new(
129        &communication_tensors,
130        Some(10),
131        5.,
132        latency_map.clone(),
133        CostType::Flops,
134    );
135    opt.find_path();
136    opt.get_best_replace_path().into_simple()
137}
138
139fn branchbound(children_tensors: &[Tensor]) -> SimplePath {
140    let communication_tensors = Tensor::new_composite(children_tensors.to_vec());
141    let latency_map = (0..children_tensors.len()).map(|i| (i, 0.0)).collect();
142
143    let mut opt = WeightedBranchBound::new(
144        &communication_tensors,
145        Some(10),
146        5.,
147        latency_map,
148        CostType::Flops,
149    );
150    opt.find_path();
151    opt.get_best_replace_path().into_simple()
152}
153
/// Uses recursive bipartitioning to identify a communication scheme for final tensors.
///
/// Returns the id chosen as the root of this subtree, the tensor resulting
/// from contracting the whole subtree, and the contraction pairs performed
/// so far (empty for a single-tensor leaf).
///
/// NOTE(review): an earlier version of this doc comment also promised a
/// "parallel contraction cost as f64" in the return value; no such value is
/// returned.
fn tensor_bipartition_recursive(
    children_tensor: &[(usize, Tensor)],
    imbalance: f64,
) -> (usize, Tensor, SimplePath) {
    // Always split into two parts; `min` is forwarded to the partitioner
    // (semantics defined by `communication_partitioning` — confirm there).
    let k = 2;
    let min = true;

    // Composite tensor contracts with a single leaf tensor
    if children_tensor.len() == 1 {
        return (
            children_tensor[0].0,
            children_tensor[0].1.clone(),
            Vec::new(),
        );
    }

    // Only occurs when there is a subset of 2 tensors
    if children_tensor.len() == 2 {
        // Always ensure that the larger tensor size is on the left.
        let (t1, t2) = if children_tensor[1].1.size() > children_tensor[0].1.size() {
            (children_tensor[1].0, children_tensor[0].0)
        } else {
            (children_tensor[0].0, children_tensor[1].0)
        };
        let tensor = &children_tensor[0].1 ^ &children_tensor[1].1;

        // The larger tensor's id (t1) labels the contracted result.
        return (t1, tensor, vec![(t1, t2)]);
    }

    let partitioning = communication_partitioning(
        children_tensor,
        k,
        imbalance,
        PartitioningStrategy::MinCut,
        min,
    );

    // Split tensors by partition label. This relies on `partition` calling
    // the predicate once per element in order, keeping `partition_iter`
    // aligned with `children_tensor`.
    let mut partition_iter = partitioning.iter();
    let (children_1, children_2): (Vec<_>, Vec<_>) = children_tensor
        .iter()
        .cloned()
        .partition(|_| partition_iter.next() == Some(&0));

    // Recurse into both halves independently.
    let (id_1, t1, mut contraction_1) = tensor_bipartition_recursive(&children_1, imbalance);

    let (id_2, t2, mut contraction_2) = tensor_bipartition_recursive(&children_2, imbalance);

    let tensor = &t1 ^ &t2;

    // Concatenate both subtrees' paths, then append the contraction joining
    // them, keeping the larger intermediate on the left (as in the 2-tensor
    // base case).
    contraction_1.append(&mut contraction_2);
    let (id_1, id_2) = if t2.size() > t1.size() {
        (id_2, id_1)
    } else {
        (id_1, id_2)
    };

    contraction_1.push((id_1, id_2));
    (id_1, tensor, contraction_1)
}
215
216/// Repeatedly bipartitions tensor network to obtain communication scheme
217/// Assumes that all tensors contracted do so in parallel
218fn tensor_bipartition(children_tensor: &[(usize, Tensor)], imbalance: f64) -> SimplePath {
219    let (_, _, contraction_path) = tensor_bipartition_recursive(children_tensor, imbalance);
220    contraction_path
221}
222
223fn random_greedy(children_tensors: &[Tensor]) -> SimplePath {
224    let communication_tensors = Tensor::new_composite(children_tensors.to_vec());
225
226    let mut opt = Cotengrust::new(&communication_tensors, OptMethod::RandomGreedy(100));
227    opt.find_path();
228    opt.get_best_replace_path().into_simple()
229}
230
#[cfg(test)]
mod tests {
    use super::*;

    use itertools::Itertools;
    use rustc_hash::FxHashMap;

    use crate::{
        contractionpath::contraction_cost::communication_path_cost, tensornetwork::tensor::Tensor,
    };

    /// Per-tensor latencies keyed by tensor id.
    fn setup_simple_partition_data() -> FxHashMap<usize, f64> {
        FxHashMap::from_iter([(0, 40.), (1, 30.), (2, 50.)])
    }

    /// Three-tensor fixture; tensor ids appear in the variable names for
    /// easy tracking.
    ///
    /// The greedy cost function prefers contracting tensor1 & tensor2 first,
    /// but tensor2 carries a high partition cost, making it more attractive
    /// to contract later and reduce wait time.
    fn setup_simple() -> Vec<Tensor> {
        let bond_dims =
            FxHashMap::from_iter([(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2), (6, 2)]);

        let tensor0 = Tensor::new_from_map(vec![3, 4, 5], &bond_dims);
        let tensor1 = Tensor::new_from_map(vec![0, 1, 3, 4], &bond_dims);
        let tensor2 = Tensor::new_from_map(vec![0, 1, 2, 5, 6], &bond_dims);
        vec![tensor0, tensor1, tensor2]
    }

    /// Shared helper: (flops, memory) cost of `path` under the fixture latencies.
    fn evaluate(
        tensors: &[Tensor],
        path: &SimplePath,
        latencies: &FxHashMap<usize, f64>,
    ) -> (f64, f64) {
        let tensor_costs = (0..tensors.len()).map(|i| latencies[&i]).collect_vec();
        communication_path_cost(tensors, path, true, true, Some(&tensor_costs))
    }

    #[test]
    fn test_greedy_communication() {
        let tensors = setup_simple();
        let latencies = setup_simple_partition_data();

        let path = greedy(&tensors, &latencies);
        assert_eq!(&path, &[(0, 1), (0, 2)]);

        let (flop_cost, mem_cost) = evaluate(&tensors, &path, &latencies);
        assert_eq!(flop_cost, 104.);
        assert_eq!(mem_cost, 44.);
    }

    #[test]
    fn test_weighted_communication() {
        let tensors = setup_simple();
        let latencies = setup_simple_partition_data();

        let path = weighted_branchbound(&tensors, &latencies);
        assert_eq!(&path, &[(1, 0), (2, 1)]);

        // Flop cost: (1, 0) = 32, tensor cost 40 => 72;
        //            (2, 1) = 32, tensor cost 50 => max(72, 50) + 32 = 104.
        // Mem cost:  (2, 1) = 2^3 + 2^5 + 2^2 = 44.
        let (flop_cost, mem_cost) = evaluate(&tensors, &path, &latencies);
        assert_eq!(flop_cost, 104.);
        assert_eq!(mem_cost, 44.);
    }

    #[test]
    fn test_bi_partition_communication() {
        let tensors = setup_simple();
        let latencies = setup_simple_partition_data();

        let path = bipartition(&tensors, &latencies);
        assert_eq!(&path, &[(2, 1), (2, 0)]);

        // Flop cost: (2, 1) = 128, tensor cost 50 => 178;
        //            (2, 0) = 32, tensor cost 40 => max(178, 40) + 32 = 210.
        // Mem cost:  (2, 1) = 2^4 + 2^5 + 2^5 = 80.
        let (flop_cost, mem_cost) = evaluate(&tensors, &path, &latencies);
        assert_eq!(flop_cost, 210.);
        assert_eq!(mem_cost, 80.);
    }
}