|
|
— |
melange:papers:fall2015 [2015/11/10 09:40] (current) |
| % PPoPP 2015 paper. The tool name is braced so sentence-casing styles keep it upper-case. |
| @inproceedings{Acharya:2015:PNM:2688500.2688512, |
|   author    = {Acharya, Aravind and Bondhugula, Uday}, |
|   title     = {{PLUTO+}: Near-complete Modeling of Affine Transformations for Parallelism and Locality}, |
|   booktitle = {Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, |
|   series    = {PPoPP 2015}, |
|   year      = {2015}, |
|   isbn      = {978-1-4503-3205-7}, |
|   location  = {San Francisco, CA, USA}, |
|   pages     = {54--64}, |
|   numpages  = {11}, |
|   url       = {http://doi.acm.org/10.1145/2688500.2688512}, |
|   doi       = {10.1145/2688500.2688512}, |
|   acmid     = {2688512}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {Affine transformations, affine scheduling, automatic parallelization, polyhedral model, stencil computations, tiling}, |
| } |
| |
| % IPDPS 2015 paper. Given names are spelled out where another entry in this file (the Cache-oblivious Wavefront paper) provides them. |
| @inproceedings{7161519, |
|   author    = {Tithi, Jesmin Jahan and Ganapathi, Pramod and Talati, A. and Aggarwal, S. and Chowdhury, Rezaul A.}, |
|   title     = {High-Performance Energy-Efficient Recursive Dynamic Programming with Matrix-Multiplication-Like Flexible Kernels}, |
|   booktitle = {Parallel and Distributed Processing Symposium (IPDPS), 2015 IEEE International}, |
|   year      = {2015}, |
|   month     = may, |
|   pages     = {303--312}, |
|   keywords  = {divide and conquer methods;dynamic programming;mathematics computing;matrix multiplication;parallel algorithms;DP problem;FW-APSP;Floyd-Warshall all-pairs shortest path;cache-oblivious recursive divide-and-conquer;dynamic programming;gap penalty;high-performing parallel implementation;matrix-multiplication-like flexible kernel;optimization;parallel CORDAC algorithm;cache-oblivious;divide-and-conquer;dynamic programming;flexible kernel;polyhedral compiler;recursive}, |
|   doi       = {10.1109/IPDPS.2015.107}, |
|   issn      = {1530-2075}, |
| } |
| |
| % PACT '14 paper. The booktitle carries the full conference name, ending in "Techniques". |
| @inproceedings{Bondhugula:2014:TOT:2628071.2628106, |
|   author    = {Bondhugula, Uday and Bandishti, Vinayaka and Cohen, Albert and Potron, Guillain and Vasilache, Nicolas}, |
|   title     = {Tiling and Optimizing Time-iterated Computations on Periodic Domains}, |
|   booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques}, |
|   series    = {PACT '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2809-8}, |
|   location  = {Edmonton, AB, Canada}, |
|   pages     = {39--50}, |
|   numpages  = {12}, |
|   url       = {http://doi.acm.org/10.1145/2628071.2628106}, |
|   doi       = {10.1145/2628071.2628106}, |
|   acmid     = {2628106}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {automatic parallelization, periodic, polyhedral model, stencils, tiling}, |
| } |
| |
| |
| % PPoPP 2015 paper on cache-oblivious wavefront scheduling of recursive dynamic programming. |
| @inproceedings{Tang:2015:CWI:2688500.2688514, |
|   author    = {Tang, Yuan and You, Ronghui and Kan, Haibin and Tithi, Jesmin Jahan and Ganapathi, Pramod and Chowdhury, Rezaul A.}, |
|   title     = {Cache-oblivious Wavefront: Improving Parallelism of Recursive Dynamic Programming Algorithms Without Losing Cache-efficiency}, |
|   booktitle = {Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, |
|   series    = {PPoPP 2015}, |
|   location  = {San Francisco, CA, USA}, |
|   year      = {2015}, |
|   pages     = {205--214}, |
|   numpages  = {10}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   isbn      = {978-1-4503-3205-7}, |
|   doi       = {10.1145/2688500.2688514}, |
|   url       = {http://doi.acm.org/10.1145/2688500.2688514}, |
|   acmid     = {2688514}, |
|   keywords  = {Cilk, cache-oblivious parallel algorithms, cache-oblivious wavefront, dynamic programming, multi-core, nested parallel computation}, |
| } |
| |
| |
| % PACT '14 paper. The system name and memory acronyms are braced so sentence-casing styles keep their capitals. |
| @inproceedings{Huang:2014:ARD:2628071.2628089, |
|   author    = {Huang, Cheng-Chieh and Nagarajan, Vijay}, |
|   title     = {{ATCache}: Reducing {DRAM} Cache Latency via a Small {SRAM} Tag Cache}, |
|   booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques}, |
|   series    = {PACT '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2809-8}, |
|   location  = {Edmonton, AB, Canada}, |
|   pages     = {51--60}, |
|   numpages  = {10}, |
|   url       = {http://doi.acm.org/10.1145/2628071.2628089}, |
|   doi       = {10.1145/2628071.2628089}, |
|   acmid     = {2628089}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {design, dram cache, performance}, |
| } |
| |
| % PACT '14 paper. ILP and TLP are braced so sentence-casing styles do not lower-case them. |
| @inproceedings{Fatehi:2014:ITS:2628071.2628093, |
|   author    = {Fatehi, Ehsan and Gratz, Paul}, |
|   title     = {{ILP} and {TLP} in Shared Memory Applications: A Limit Study}, |
|   booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques}, |
|   series    = {PACT '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2809-8}, |
|   location  = {Edmonton, AB, Canada}, |
|   pages     = {113--126}, |
|   numpages  = {14}, |
|   url       = {http://doi.acm.org/10.1145/2628071.2628093}, |
|   doi       = {10.1145/2628071.2628093}, |
|   acmid     = {2628093}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {instruction-level parallelism (ilp), limits, pthreads, thread-level parallelism (tlp)}, |
| } |
| |
| |
| % PACT '14 paper. The booktitle carries the full conference name, ending in "Techniques". |
| @inproceedings{Cameron:2014:BDP:2628071.2628079, |
|   author    = {Cameron, Robert D. and Shermer, Thomas C. and Shriraman, Arrvindh and Herdy, Kenneth S. and Lin, Dan and Hull, Benjamin R. and Lin, Meng}, |
|   title     = {Bitwise Data Parallelism in Regular Expression Matching}, |
|   booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques}, |
|   series    = {PACT '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2809-8}, |
|   location  = {Edmonton, AB, Canada}, |
|   pages     = {139--150}, |
|   numpages  = {12}, |
|   url       = {http://doi.acm.org/10.1145/2628071.2628079}, |
|   doi       = {10.1145/2628071.2628079}, |
|   acmid     = {2628079}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {parallel bit streams, regular expression matching}, |
| } |
| |
| % PACT '14 paper. The camel-case tool name is braced so sentence-casing styles keep the inner capital. |
| @inproceedings{Ansel:2014:OEF:2628071.2628092, |
|   author    = {Ansel, Jason and Kamil, Shoaib and Veeramachaneni, Kalyan and Ragan-Kelley, Jonathan and Bosboom, Jeffrey and O'Reilly, Una-May and Amarasinghe, Saman}, |
|   title     = {{OpenTuner}: An Extensible Framework for Program Autotuning}, |
|   booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques}, |
|   series    = {PACT '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2809-8}, |
|   location  = {Edmonton, AB, Canada}, |
|   pages     = {303--316}, |
|   numpages  = {14}, |
|   url       = {http://doi.acm.org/10.1145/2628071.2628092}, |
|   doi       = {10.1145/2628071.2628092}, |
|   acmid     = {2628092}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {autotuner, optimization}, |
| } |
| |
| |
| % HPCA 2014 paper. The architecture name is braced so styles keep its capital. |
| @inproceedings{6835968, |
|   author    = {Hayenga, M. and Naresh, V. R. K. and Lipasti, M. H.}, |
|   title     = {{Revolver}: Processor architecture for power efficient loop execution}, |
|   booktitle = {High Performance Computer Architecture (HPCA), 2014 IEEE 20th International Symposium}, |
|   year      = {2014}, |
|   month     = feb, |
|   pages     = {591--602}, |
|   keywords  = {computer architecture;energy conservation;instruction sets;power aware computing;Revolver architecture;branch prediction;dispatch logic;energy efficiency;frontend instruction dispatches;instruction fetch;loop buffers;loop execution;loop iterations;micro-op cache techniques;out-of-order execution core;out-of-order processor architecture;power efficient loop execution;processor core;processor frontend;static instruction instances;Arrays;Clocks;Out of order;Pipelines;Rain;Registers;Resource management}, |
|   doi       = {10.1109/HPCA.2014.6835968}, |
| } |
| |
| |
| % FPT 2009 paper. FPGA and 3D are braced so sentence-casing styles do not lower-case them. |
| @inproceedings{5377644, |
|   author    = {Shafiq, M. and Pericas, M. and de la Cruz, R. and Araya-Polo, M. and Navarro, N. and Ayguade, E.}, |
|   title     = {Exploiting memory customization in {FPGA} for {3D} stencil computations}, |
|   booktitle = {Field-Programmable Technology, 2009. FPT 2009.}, |
|   year      = {2009}, |
|   month     = dec, |
|   pages     = {38--45}, |
|   keywords  = {field programmable gate arrays;signal processing;3D stencil computations;FPGA;IBM PowerXCell 8i;data reuse;memory customization;memory organization;memory-bound kernels;Bandwidth;Computer applications;Field programmable gate arrays;Finite difference methods;Finite impulse response filter;Hardware;Kernel;Nearest neighbor searches;Throughput;Time domain analysis}, |
|   doi       = {10.1109/FPT.2009.5377644}, |
| } |
| |
| |
| % HPDC '15 paper. GPU is braced so sentence-casing styles do not lower-case it. |
| @inproceedings{Wahib:2015:AGK:2749246.2749255, |
|   author    = {Wahib, Mohamed and Maruyama, Naoya}, |
|   title     = {Automated {GPU} Kernel Transformations in Large-Scale Production Stencil Applications}, |
|   booktitle = {Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing}, |
|   series    = {HPDC '15}, |
|   year      = {2015}, |
|   isbn      = {978-1-4503-3550-8}, |
|   location  = {Portland, Oregon, USA}, |
|   pages     = {259--270}, |
|   numpages  = {12}, |
|   url       = {http://doi.acm.org/10.1145/2749246.2749255}, |
|   doi       = {10.1145/2749246.2749255}, |
|   acmid     = {2749255}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {cuda, gpu, source-to-source translation, stencil computations}, |
| } |
| |
| |
| % PPoPP 2015 paper on parallel fast matrix multiplication. |
| @inproceedings{Benson:2015:FPP:2688500.2688513, |
|   author    = {Benson, Austin R. and Ballard, Grey}, |
|   title     = {A Framework for Practical Parallel Fast Matrix Multiplication}, |
|   booktitle = {Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, |
|   series    = {PPoPP 2015}, |
|   location  = {San Francisco, CA, USA}, |
|   year      = {2015}, |
|   pages     = {42--53}, |
|   numpages  = {12}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   isbn      = {978-1-4503-3205-7}, |
|   doi       = {10.1145/2688500.2688513}, |
|   url       = {http://doi.acm.org/10.1145/2688500.2688513}, |
|   acmid     = {2688513}, |
|   keywords  = {dense linear algebra, fast matrix multiplication, parallel linear algebra, shared memory}, |
| } |
| |
| |
| % IPDPSW '14 workshop paper on selecting dynamic loop scheduling algorithms. |
| @inproceedings{Sukhija:2014:PSR:2672598.2672904, |
|   author    = {Sukhija, Nitin and Malone, Brandon and Srivastava, Srishti and Banicescu, Ioana and Ciorba, Florina M.}, |
|   title     = {Portfolio-Based Selection of Robust Dynamic Loop Scheduling Algorithms Using Machine Learning}, |
|   booktitle = {Proceedings of the 2014 IEEE International Parallel \& Distributed Processing Symposium Workshops}, |
|   series    = {IPDPSW '14}, |
|   year      = {2014}, |
|   pages     = {1638--1647}, |
|   numpages  = {10}, |
|   publisher = {IEEE Computer Society}, |
|   address   = {Washington, DC, USA}, |
|   isbn      = {978-1-4799-4116-2}, |
|   doi       = {10.1109/IPDPSW.2014.183}, |
|   url       = {http://dx.doi.org/10.1109/IPDPSW.2014.183}, |
|   acmid     = {2672904}, |
|   keywords  = {Dynamic loop scheduling, robustness, algorithm selection, empirical robustness prediction models, machine learning techniques, variable system availability}, |
| } |
| |
| % Chapter in the Euro-Par 2014 proceedings (volume 8632). Accented editor names use BibTeX special-character escapes. |
| % NOTE(review): the series name and the accented editor spellings are not visible in the original entry - confirm against the volume front matter. |
| @incollection{Tiwari2014, |
|   author    = {Tiwari, Ananta and Gamst, Anthony and Laurenzano, Michael A. and Schulz, Martin and Carrington, Laura}, |
|   title     = {Modeling the Impact of Reduced Memory Bandwidth on {HPC} Applications}, |
|   booktitle = {Euro-Par 2014 Parallel Processing}, |
|   editor    = {Silva, Fernando and Dutra, In{\^e}s and Santos Costa, V{\'\i}tor}, |
|   series    = {Lecture Notes in Computer Science}, |
|   volume    = {8632}, |
|   year      = {2014}, |
|   pages     = {63--74}, |
|   publisher = {Springer International Publishing}, |
|   isbn      = {978-3-319-09872-2}, |
|   doi       = {10.1007/978-3-319-09873-9_6}, |
|   url       = {http://dx.doi.org/10.1007/978-3-319-09873-9_6}, |
|   language  = {English}, |
| } |
| |
| |
| % HPCA 2015 paper. GPUs and CC-NUMA are braced so sentence-casing styles do not lower-case them. |
| @inproceedings{7056046, |
|   author    = {Agarwal, N. and Nellans, D. and O'Connor, M. and Keckler, S. W. and Wenisch, T. F.}, |
|   title     = {Unlocking bandwidth for {GPUs} in {CC-NUMA} systems}, |
|   booktitle = {High Performance Computer Architecture (HPCA), 2015 IEEE 21st International Symposium}, |
|   year      = {2015}, |
|   month     = feb, |
|   pages     = {354--365}, |
|   keywords  = {cache storage;graphics processing units;parallel processing;storage management;CC-NUMA GPU-CPU systems;CPU memory bandwidth;GDDR memory;GPU kernel;GPU memory bandwidth;GPU relaxed memory semantics;GPU-based HPC applications;aggressive memory prefetching;bandwidth balancing;hardware cache-coherence;memory-intensive GPU workloads;minimal hardware support;on-demand software page migration;oracular page placement;software runtime system;virtual address-based program locality;Bandwidth;Graphics processing units;Hardware;Memory management;Random access memory;Runtime}, |
|   doi       = {10.1109/HPCA.2015.7056046}, |
| } |
| |
| |
| % IPDPSW 2012 paper. Authors use the unambiguous "Last, First" form; GPU is braced against sentence-casing. |
| @inproceedings{6270616, |
|   author    = {Zhang, Changyou and Huang, Kun and Cui, Xiang and Chen, Yifeng}, |
|   title     = {Power-aware Programming with {GPU} Accelerators}, |
|   booktitle = {Parallel and Distributed Processing Symposium Workshops PhD Forum (IPDPSW), 2012 IEEE 26th International}, |
|   year      = {2012}, |
|   month     = may, |
|   pages     = {2443--2449}, |
|   keywords  = {graphics processing units;multi-threading;multiprocessing systems;power aware computing;ubiquitous computing;GPU accelerators;high-level program development;manycore processor;multithreaded processor;on-chip parallelism;parallel processor;power consumption values;power efficiency;power estimation;power-aware programming;processor computational power;processor memory bandwidth;program statements;ubiquitous computing;Bandwidth;Graphics processing unit;Hardware;Memory management;Message systems;Power demand;Power measurement;GPU;Power-aware;Primitive;Programming}, |
|   doi       = {10.1109/IPDPSW.2012.301}, |
| } |
| |
| |
| % ICPE '14 paper. The product name is braced word by word so sentence-casing styles keep its capitals. |
| @inproceedings{Fang:2014:TIX:2568088.2576799, |
|   author    = {Fang, Jianbin and Sips, Henk and Zhang, LiLun and Xu, Chuanfu and Che, Yonggang and Varbanescu, Ana Lucia}, |
|   title     = {Test-driving {Intel} {Xeon} {Phi}}, |
|   booktitle = {Proceedings of the 5th ACM/SPEC International Conference on Performance Engineering}, |
|   series    = {ICPE '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2733-6}, |
|   location  = {Dublin, Ireland}, |
|   pages     = {137--148}, |
|   numpages  = {12}, |
|   url       = {http://doi.acm.org/10.1145/2568088.2576799}, |
|   doi       = {10.1145/2568088.2576799}, |
|   acmid     = {2576799}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {experience with xeon phi, microbenchmarking, optimization, performance analysis}, |
| } |
| |
| |
| % PPoPP 2015 paper. The accented author name uses a BibTeX special-character escape so classic BibTeX sorts and labels it correctly. |
| @inproceedings{Ravishankar:2015:DMC:2688500.2688515, |
|   author    = {Ravishankar, Mahesh and Dathathri, Roshan and Elango, Venmugil and Pouchet, Louis-No{\"e}l and Ramanujam, J. and Rountev, Atanas and Sadayappan, P.}, |
|   title     = {Distributed Memory Code Generation for Mixed Irregular/Regular Computations}, |
|   booktitle = {Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, |
|   series    = {PPoPP 2015}, |
|   year      = {2015}, |
|   isbn      = {978-1-4503-3205-7}, |
|   location  = {San Francisco, CA, USA}, |
|   pages     = {65--75}, |
|   numpages  = {11}, |
|   url       = {http://doi.acm.org/10.1145/2688500.2688515}, |
|   doi       = {10.1145/2688500.2688515}, |
|   acmid     = {2688515}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   keywords  = {Distributed Memory, Inspector/Executor, Irregular Computation, Polyhedral Compilation}, |
| } |
| |
| |
| % DAC '14 paper (article 77) on a stencil accelerator microarchitecture. |
| @inproceedings{Cong:2014:OMS:2593069.2593090, |
|   author    = {Cong, Jason and Li, Peng and Xiao, Bingjun and Zhang, Peng}, |
|   title     = {An Optimal Microarchitecture for Stencil Computation Acceleration Based on Non-Uniform Partitioning of Data Reuse Buffers}, |
|   booktitle = {Proceedings of the 51st Annual Design Automation Conference}, |
|   series    = {DAC '14}, |
|   location  = {San Francisco, CA, USA}, |
|   year      = {2014}, |
|   articleno = {77}, |
|   pages     = {77:1--77:6}, |
|   numpages  = {6}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   isbn      = {978-1-4503-2730-5}, |
|   doi       = {10.1145/2593069.2593090}, |
|   url       = {http://doi.acm.org/10.1145/2593069.2593090}, |
|   acmid     = {2593090}, |
| } |
| |
| % Journal article (ISSN 1045-9219). The journal is stored under its full name in natural word order. |
| @article{6470606, |
|   author   = {Sano, K. and Hatsuda, Y. and Yamamoto, S.}, |
|   title    = {{Multi-FPGA} Accelerator for Scalable Stencil Computation with Constant Memory Bandwidth}, |
|   journal  = {IEEE Transactions on Parallel and Distributed Systems}, |
|   year     = {2014}, |
|   month    = mar, |
|   volume   = {25}, |
|   number   = {3}, |
|   pages    = {695--705}, |
|   keywords = {field programmable gate arrays;parallel processing;storage management;CCM;GPU;Jacobi computation;SSA;custom computing machine;deep pipelining approach;domain-specific programmable concept;field programmable gate array;graphics processing unit;high-performance stencil computations;memory bandwidth;multiFPGA accelerator;multicore microprocessors;scalable stencil computation;scalable streaming-array;scientific computations;Arrays;Bandwidth;Computational modeling;Field programmable gate arrays;Hardware;Scalability;FPGA;Scalable streaming-array;custom computing machine;high-performance computation;stencil computation}, |
|   doi      = {10.1109/TPDS.2013.51}, |
|   issn     = {1045-9219}, |
| } |
| |
| |
| |
| % CODES+ISSS '06 paper on multifunction loop accelerators. |
| @inproceedings{Fan:2006:IHE:1176254.1176322, |
|   author    = {Fan, Kevin and Kudlur, Manjunath and Park, Hyunchul and Mahlke, Scott}, |
|   title     = {Increasing Hardware Efficiency with Multifunction Loop Accelerators}, |
|   booktitle = {Proceedings of the 4th International Conference on Hardware/Software Codesign and System Synthesis}, |
|   series    = {CODES+ISSS '06}, |
|   location  = {Seoul, Korea}, |
|   year      = {2006}, |
|   pages     = {276--281}, |
|   numpages  = {6}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
|   isbn      = {1-59593-370-0}, |
|   doi       = {10.1145/1176254.1176322}, |
|   url       = {http://doi.acm.org/10.1145/1176254.1176322}, |
|   acmid     = {1176322}, |
|   keywords  = {application-specific hardware, high-level synthesis, loop accelerator, modulo scheduling, multifunction design}, |
| } |
| |
| % Journal article (ISSN 1094-3420). The journal is stored under its full name; styles can abbreviate it if needed. |
| @article{Meswani:2013:MPP:2493921.2493922, |
|   author     = {Meswani, Mitesh R. and Carrington, Laura and Unat, Didem and Snavely, Allan and Baden, Scott and Poole, Stephen}, |
|   title      = {Modeling and Predicting Performance of High Performance Computing Applications on Hardware Accelerators}, |
|   journal    = {The International Journal of High Performance Computing Applications}, |
|   issue_date = {May 2013}, |
|   volume     = {27}, |
|   number     = {2}, |
|   month      = may, |
|   year       = {2013}, |
|   issn       = {1094-3420}, |
|   pages      = {89--108}, |
|   numpages   = {20}, |
|   url        = {http://dx.doi.org/10.1177/1094342012468180}, |
|   doi        = {10.1177/1094342012468180}, |
|   acmid      = {2493922}, |
|   publisher  = {Sage Publications, Inc.}, |
|   address    = {Thousand Oaks, CA, USA}, |
|   keywords   = {FPGA, GPU, HPC, accelerators, benchmarking, idioms, performance modeling, performance prediction}, |
| } |
| |
| |
| % Journal article (ISSN 0141-9331), double issue 6--7. The journal is stored under its full name. |
| @article{Nery:2013:HRM:2537182.2537569, |
|   author     = {Nery, Alexandre S. and Jozwiak, Lech and Lindwer, Menno and Cocco, Mauro and Nedjah, Nadia and Franca, Felipe M. G.}, |
|   title      = {Hardware Reuse in Modern Application-specific Processors and Accelerators}, |
|   journal    = {Microprocessors and Microsystems}, |
|   issue_date = {August, 2013}, |
|   volume     = {37}, |
|   number     = {6--7}, |
|   month      = aug, |
|   year       = {2013}, |
|   issn       = {0141-9331}, |
|   pages      = {684--692}, |
|   numpages   = {9}, |
|   url        = {http://dx.doi.org/10.1016/j.micpro.2012.06.005}, |
|   doi        = {10.1016/j.micpro.2012.06.005}, |
|   acmid      = {2537569}, |
|   publisher  = {Elsevier Science Publishers B. V.}, |
|   address    = {Amsterdam, The Netherlands}, |
|   keywords   = {Application-specific processors, Area reduction, Hardware accelerator, Power reduction, Resource sharing}, |
| } |
| |
| |
| |
| % SC '12 paper (article 40) on tiling stencil computations. |
| @inproceedings{Bandishti:2012:TSC:2388996.2389051, |
|   author    = {Bandishti, Vinayaka and Pananilath, Irshad and Bondhugula, Uday}, |
|   title     = {Tiling Stencil Computations to Maximize Parallelism}, |
|   booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}, |
|   series    = {SC '12}, |
|   location  = {Salt Lake City, Utah}, |
|   year      = {2012}, |
|   articleno = {40}, |
|   pages     = {40:1--40:11}, |
|   numpages  = {11}, |
|   publisher = {IEEE Computer Society Press}, |
|   address   = {Los Alamitos, CA, USA}, |
|   isbn      = {978-1-4673-0804-5}, |
|   url       = {http://dl.acm.org/citation.cfm?id=2388996.2389051}, |
|   acmid     = {2389051}, |
|   keywords  = {compilers, program transformation}, |
| } |
| |
| |
| % IMPACT 2013 workshop paper. Authors use "Last, First" form and the month uses the predefined macro. |
| @inproceedings{Wonnacott13, |
|   author    = {Wonnacott, Dave G. and Strout, Michelle Mills}, |
|   title     = {On the Scalability of Loop Tiling Techniques}, |
|   booktitle = {Proceedings of the 3rd International Workshop on Polyhedral Compilation Techniques (IMPACT)}, |
|   series    = {IMPACT 2013}, |
|   year      = {2013}, |
|   month     = jan, |
|   url       = {http://impact.gforge.inria.fr/impact2013/papers/impact2013_on_the_scalability_of_loop_tiling_techniques.pdf}, |
| } |
| |
| |
| % Supercomputing '91 paper. "Omega" is braced as a proper name so sentence-casing styles keep its capital. |
| @inproceedings{Pugh:1991:OTF:125826.125848, |
|   author    = {Pugh, William}, |
|   title     = {The {Omega} Test: A Fast and Practical Integer Programming Algorithm for Dependence Analysis}, |
|   booktitle = {Proceedings of the 1991 ACM/IEEE Conference on Supercomputing}, |
|   series    = {Supercomputing '91}, |
|   year      = {1991}, |
|   isbn      = {0-89791-459-7}, |
|   location  = {Albuquerque, New Mexico, USA}, |
|   pages     = {4--13}, |
|   numpages  = {10}, |
|   url       = {http://doi.acm.org/10.1145/125826.125848}, |
|   doi       = {10.1145/125826.125848}, |
|   acmid     = {125848}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
| } |
| |
| |
| % Technical report (per its report number and TR url); it names no proceedings, so it is typed as a report. |
| % NOTE(review): year is kept as 2015 from the original entry although the report is dated Nov 2014 - confirm. |
| % NOTE(review): institution is expanded from the "IISc-CSA" prefix and the csa.iisc.ernet.in host - confirm. |
| @techreport{cohenautomatic, |
|   author      = {Bhaskaracharya, Somashekaracharya G. and Bondhugula, Uday and Cohen, Albert}, |
|   title       = {Automatic Intra-Array Storage Optimization}, |
|   institution = {Department of Computer Science and Automation, Indian Institute of Science}, |
|   number      = {IISc-CSA-TR-2014-3}, |
|   year        = {2015}, |
|   note        = {Nov 2014; submitted to ACM TOPLAS, Feb 2015}, |
|   url         = {http://www.csa.iisc.ernet.in/TR/2014/3/paper.pdf}, |
| } |
| |
| % PLDI '14 paper. The accented author name uses a BibTeX special-character escape so classic BibTeX sorts and labels it correctly. |
| @inproceedings{Stock:2014:FED:2594291.2594342, |
|   author    = {Stock, Kevin and Kong, Martin and Grosser, Tobias and Pouchet, Louis-No{\"e}l and Rastello, Fabrice and Ramanujam, J. and Sadayappan, P.}, |
|   title     = {A Framework for Enhancing Data Reuse via Associative Reordering}, |
|   booktitle = {Proceedings of the 35th ACM SIGPLAN Conference on Programming Language Design and Implementation}, |
|   series    = {PLDI '14}, |
|   year      = {2014}, |
|   isbn      = {978-1-4503-2784-8}, |
|   location  = {Edinburgh, United Kingdom}, |
|   pages     = {65--76}, |
|   numpages  = {12}, |
|   url       = {http://doi.acm.org/10.1145/2594291.2594342}, |
|   doi       = {10.1145/2594291.2594342}, |
|   acmid     = {2594342}, |
|   publisher = {ACM}, |
|   address   = {New York, NY, USA}, |
| } |
| |
| % Journal article (ISSN 1544-3566, article 45). The journal is stored under its full name; styles can abbreviate it if needed. |
| @article{Cilardo:2015:IMM:2695583.2675359, |
|   author     = {Cilardo, Alessandro and Gallo, Luca}, |
|   title      = {Improving Multibank Memory Access Parallelism with Lattice-Based Partitioning}, |
|   journal    = {ACM Transactions on Architecture and Code Optimization}, |
|   issue_date = {January 2015}, |
|   volume     = {11}, |
|   number     = {4}, |
|   month      = jan, |
|   year       = {2015}, |
|   issn       = {1544-3566}, |
|   pages      = {45:1--45:25}, |
|   articleno  = {45}, |
|   numpages   = {25}, |
|   url        = {http://doi.acm.org/10.1145/2675359}, |
|   doi        = {10.1145/2675359}, |
|   acmid      = {2675359}, |
|   publisher  = {ACM}, |
|   address    = {New York, NY, USA}, |
|   keywords   = {Memory partitioning, field-programmable gate arrays, fine-grained distributed shared memory, polyhedral model}, |
| } |
| |