melange:papers:spring2018

Differences

This shows you the differences between two versions of the page.

melange:papers:spring2018 [2018/02/21 11:36]
prerana
melange:papers:spring2018 [2018/02/21 17:13]
sanjay
Line 1: Line 1:
 @article{Bielecki:2016:TAN:3060371.3060383,
- author = {Bielecki, W\lodzimierz and Pa\lkowski, Marek},
+ author = {Bielecki, Wlodzimierz and Pa\lkowski, Marek},
  title = {Tiling Arbitrarily Nested Loops by Means of the Transitive Closure of Dependence Graphs},
  journal = {Int. J. Appl. Math. Comput. Sci.},
Line 48: Line 48:
 number="6",
 pages="607--631",
-abstract="There are many algorithms for the space-time mapping of nested loops. Some of them even make the optimal choices within their framework. We propose a preprocessing phase for algorithms in the polytope model, which extends the model and yields space-time mappings whose schedule is, in some cases, orders of magnitude faster. These are cases in which the dependence graph has small irregularities. The basic idea is to split the index set of the loop nests into parts with a regular dependence structure and apply the existing space-time mapping algorithms to these parts individually. This work is based on a seminal idea in the more limited context of loop parallelization at the code level. We elevate the idea to the model level (our model is the polytope model), which increases its applicability by providing a clearer and wider range of choices at an acceptable analysis cost. Index set splitting is one facet in the effort to extend the power of the polytope model and to enable the generation of competitive target code.", 
-issn="1573-7640", 
 doi="10.1023/A:1007516818651",
 url="https://doi.org/10.1023/A:1007516818651"
Line 79: Line 77:
 number={1},
 pages={127-138},
-keywords={DRAM chips;data flow computing;energy conservation;feedforward neural nets;learning (artificial intelligence);neural net architecture;power aware computing;reconfigurable architectures;AI systems;AlexNet;CNN shapes;DRAM accesses;Eyeriss;MAC;RS dataflow reconfiguration;accelerator chip;convolutional layers;data movement energy cost;dataflow processing;deep convolutional neural networks;energy efficiency;energy-efficient reconfigurable accelerator;multiply and accumulation;off-chip DRAM;reconfiguring architecture;row stationary;spatial architecture;Clocks;Computer architecture;Hardware;Neural networks;Random access memory;Shape;Throughput;Convolutional neural networks (CNNs);dataflow processing;deep learning;energy-efficient accelerators;spatial architecture}, 
 url = {http://ieeexplore.ieee.org/document/7738524/},
 doi={10.1109/JSSC.2016.2616357},
Line 187: Line 184:
 number="2",
 pages="95--113",
-abstract="The parallelization of many algorithms can be obtained using space-time transformations which are applied on nested do-loops or on recurrence equations. In this paper, we analyze systems of linear recurrence equations, a generalization of uniform recurrence equations. The first part of the paper describes a method for finding automatically whether such a system can be scheduled by an affine timing function, independent of the size parameter of the algorithm. In the second part, we describe a powerful method that makes it possible to transform linear recurrences into uniform recurrence equations. Both parts rely on results on integral convex polyhedra. Our results are illustrated on the Gauss elimination algorithm and on the Gauss-Jordan diagonalization algorithm.", 
 issn="0922-5773",
 doi="10.1007/BF02477176",
Line 203: Line 199:
 month=          "Feb",
 url=            "http://www.cs.colostate.edu/~cs560/Spring2011/Notes/FeautrierEDFAijpp91.pdf"
-annote=         "This article explains how a simple imperative language 
-        program (consisting only of assignments, for loops with affine loop 
-        limits, and arrays with affine index expressions), can be statically 
-        analyzed to find the flow dependencies." 
 }
  
Line 418: Line 410:
  
 @article{cummins2017synthesizing,
- 
    title={Synthesizing benchmarks for predictive modeling},
- 
    author={Cummins, Chris and Petoumenos, Pavlos and Wang, Zheng and Leather, Hugh},
- 
    year={2017},
-    
    url={http://homepages.inf.ed.ac.uk/hleather/publications/2017-benchsynth-cgo.pdf}
- 
 }
  
 @article{optimistic2017,
- 
    title={Optimistic Loop Optimization},
- 
    author={Doerfert, Johannes and Grosser, Tobias and Hack, Sebastian},
    url = {http://dl.acm.org/citation.cfm?id=3049832.3049864},
    year={2017}
- 
 }
  
Line 483: Line 467:
 month = {February},
 url = {https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CNN20Whitepaper.pdf},
-abstract = { 
- 
-We describe the design of a convolutional neural network accelerator running on a Stratix V FPGA. The design runs at three times the throughput of previous FPGA CNN accelerator designs. We show that the throughput/watt is significantly higher than for a GPU, and project the performance when ported to an Arria 10 FPGA. 
- 
- 
-}, 
 publisher = {Microsoft Research},
 url = {https://www.microsoft.com/en-us/research/publication/accelerating-deep-convolutional-neural-networks-using-specialized-hardware/},
melange/papers/spring2018.txt · Last modified: 2018/04/18 10:30 by prerana