Differences

This shows you the differences between two versions of the page.

--- melange:papers:fall2021 [2021/09/13 20:09]
corentin
+++ melange:papers:fall2021 [2021/09/23 08:07]
corentin
@@ Line 23: / Line 23: @@
   address   = {New York, NY, USA},
   url       = {https://doi.org/10.1145/3453483.3454079},
-  abstract  = {Parallel reduction, which summarizes a given dataset, e.g., the total, average, and
+  abstract  = {Parallel reduction, which summarizes a given dataset, e.g., the total, average, and maximum, plays a crucial role in parallel programming. This paper presents a new approach, reverse engineering, to automatically discovering nontrivial parallel reductions in sequential programs. The body of the sequential reduction loop is regarded as a black box, and its input-output behaviors are sampled. If the behaviors correspond to a set of linear polynomials over a semiring, a divide-and-conquer parallel reduction is generated. Auxiliary reverse-engineering methods enable a long and nested loop body to be decomposed, which makes our parallelization scheme applicable to various types of reduction loops. This approach is not only simple and efficient but also agnostic to the details of the input program. Its potential is demonstrated through several use case scenarios. A proof-of-concept implementation successfully inferred linear polynomials for nearly all of the 74 benchmarks exhaustively collected from the literature. These characteristics and experimental results demonstrate the promise of the proposed approach, despite its inherent unsoundness.},
-maximum, plays a crucial role in parallel programming. This paper presents a new approach,
-reverse engineering, to automatically discovering nontrivial parallel reductions in
-sequential programs. The body of the sequential reduction loop is regarded as a black
-box, and its input-output behaviors are sampled. If the behaviors correspond to a
-set of linear polynomials over a semiring, a divide-and-conquer parallel reduction
-is generated. Auxiliary reverse-engineering methods enable a long and nested loop
-body to be decomposed, which makes our parallelization scheme applicable to various
-types of reduction loops. This approach is not only simple and efficient but also
-agnostic to the details of the input program. Its potential is demonstrated through
-several use case scenarios. A proof-of-concept implementation successfully inferred
-linear polynomials for nearly all of the 74 benchmarks exhaustively collected from
-the literature. These characteristics and experimental results demonstrate the promise
-of the proposed approach, despite its inherent unsoundness.},
   booktitle = {Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation},
+  loc       = {Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation},
+  number    = {2021},
   pages     = {820--834},
   numpages  = {15}
@@ Line 51: / Line 40: @@
   url       = {https://doi.org/10.1145/3243176.3243204},
   doi       = {10.1145/3243176.3243204},
-  abstract  = {Many sequential loops are actually recurrences and can be parallelized across iterations
+  abstract  = {Many sequential loops are actually recurrences and can be parallelized across iterations as scans or reductions. Many efforts over the past 2+ decades have focused on parallelizing such loops by extracting and exploiting the hidden scan/reduction patterns. These approaches have largely been based on a heuristic search for closed-form composition of computations across loop iterations. While the search-based approaches are successful in parallelizing many recurrences, they have a large search overhead and need extensive program analysis. In this work, we propose a novel approach called sampling-and-reconstruction, which avoids the search for closed-form composition and has the potential to cover more recurrence loops. It is based on an observation that many recurrences can have a point-value representation. The loop iterations are divided across processors, and where the initial value(s) of the recurrence variable(s) are unknown, we execute with several chosen (sampling) initial values. Then, correct final result can be obtained by reconstructing the function from the outputs produced on the chosen initial values. Our approach is effective in parallelizing linear, rectified-linear, finite-state and multivariate recurrences, which cover all of the test cases in previous works. Our evaluation shows that our approach can parallelize a diverse set of sequential loops, including cases that cannot be parallelized by a state-of-the-art static parallelization tool, and achieves linear scalability across multiple cores.},
-as scans or reductions. Many efforts over the past 2+ decades have focused on parallelizing
-such loops by extracting and exploiting the hidden scan/reduction patterns. These
-approaches have largely been based on a heuristic search for closed-form composition
-of computations across loop iterations.While the search-based approaches are successful
-in parallelizing many recurrences, they have a large search overhead and need extensive
-program analysis. In this work, we propose a novel approach called sampling-and-reconstruction,
-which avoids the search for closed-form composition and has the potential to cover
-more recurrence loops. It is based on an observation that many recurrences can have
-a point-value representation. The loop iterations are divided across processors, and
-where the initial value(s) of the recurrence variable(s) are unknown, we execute with
-several chosen (sampling) initial values. Then, correct final result can be obtained
-by reconstructing the function from the outputs produced on the chosen initial values.
-Our approach is effective in parallelizing linear, rectified-linear, finite-state
-and multivariate recurrences, which cover all of the test cases in previous works.
-Our evaluation shows that our approach can parallelize a diverse set of sequential
-loops, including cases that cannot be parallelized by a state-of-the-art static parallelization
-tool, and achieves linear scalability across multiple cores.},
   booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
+  loc       = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
+  number    = {2018},
   articleno = {10},
   numpages  = {13},
@@ Line 75: / Line 49: @@
   location  = {Limassol, Cyprus},
   series    = {PACT '18}
+}
+@misc{blleloch2019improved,
+  title         = {Improved Parallel Cache-Oblivious Algorithms for Dynamic Programming and Linear Algebra},
+  author        = {Guy E. Blelloch and Yan Gu},
+  year          = {2019},
+  eprint        = {1809.09330},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.DS},
+  loc           = {arXiv},
+  number        = {1809.09330},
+  url           = {https://arxiv.org/abs/1809.09330}
+}
+@inproceedings{Henry_2021,
+  title         = {Compilation of Sparse Array Programming Models},
+  author        = {Rawn Henry and Olivia Hsu and Rohan Yadav and Stephen Chou and Kunle Olukotun and Saman Amarasinghe and Fredrik Kjolstad},
+  year          = {2021},
+  articleno     = {128},
+  numpages      = {29},
+  url           = {http://fredrikbk.com/publications/Sparse_Array_Programming.pdf},
+  publisher     = {Association for Computing Machinery},
+  loc           = {Proc. ACM Program. Lang. 5},
+  number        = {},
+  doi           = {10.1145/3485505}
 }

AlphaZ

User Tools

Site Tools

Differences

Page Tools