@article{15c4db8f03ba4a48b29166065ac354cf,
title = "Parallelizing Strassen's method for matrix multiplication on distributed-memory MIMD architectures",
abstract = "We present a parallel method for matrix multiplication on distributed-memory MIMD architectures based on Strassen's method. Our timing tests, performed on a 56-node Intel Paragon, demonstrate the realization of the potential of the Strassen's method with a complexity of 4.7 M2.807 at the system level rather than the node level at which several earlier works have been focused. The parallel efficiency is nearly perfect when the processor number is the power of 7. The parallelized Strassen's method seems always faster than the traditional matrix multiplication methods whose complexity is 2M3 coupled with the BMR method and the Ring method at the system level. The speed gain depends on matrix order M: 20% for M ≈ 1000 and more than 100% for M ≈ 5000.",
keywords = "Matrix multiplication, Parallel computation, Strassen's method",
author = "Chou, {C. C.} and Deng, {Y. F.} and G. Li and Y. Wang",
note = "Funding Information: Of course, the full potential of these efficient methods can be realized only on large matrices, which require large machines such as parallel computers. Thus, designing efficient parallel algorithms for these methods becomes essential. The parallelization of the general linear algebra routines on distributed-memory MIMD architectures has achieved reasonable success \[7\]. But, due to the complication of these dedicated MM methods, the progress has been relatively slower. In fact, Manber \[8\]i n 1989 claimed that S-method cannot be easily parallelized. In addition, the Winograd method has not been parallelized so far. Indeed, the attempts for the parallelization This project was initiated after a fruitful conversation of Y.-F.D. with S.-T. Yau of Harvard University, in the summer of 1993 in Hong Kong. Y.-F.D. thanks D. Scott and W.-L. Wang of Intel Supercomputer Systems Division for discussions on local Strassen implementation, and D. Trystram who brought their unpublished work to our attention, and C.-C.C. thanks S. Huss-Lederman on their BMR implementation on Intel Delta machine. All four of us are supported by the Army Research Office, Grant DAAL03-92-G-0185 and through the Mathematical Sciences Institute of Cornell University under subcontract to the University at Stony Brook, Alto contract number DAAL03-91-C-0027, and the National Science Foundation, Grant DMS-9201581. *Author to whom all correspondence should be sent. Email address: Yuefan.Deng{\textcopyright}stmysb. edu",
year = "1995",
month = jul,
doi = "10.1016/0898-1221(95)00077-C",
language = "English (US)",
volume = "30",
pages = "49--69",
journal = "Computers and Mathematics with Applications",
issn = "0898-1221",
publisher = "Elsevier Ltd",
number = "2",
}
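
A brief sketch of where the 2.807 exponent quoted in the abstract comes from, assuming the standard Strassen recursion (7 half-size block products in place of the classical 8); this note is not part of the bibliographic record itself:
\[
  T(M) \;=\; 7\,T\!\left(\tfrac{M}{2}\right) + O(M^2)
  \quad\Longrightarrow\quad
  T(M) \;=\; O\!\left(M^{\log_2 7}\right) \;\approx\; O\!\left(M^{2.807}\right),
\]
compared with the \(2M^3\) operation count of the classical method mentioned in the abstract; the leading constant 4.7 is the coefficient reported by the authors.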