Publications

Cornell, S., Boeddeker, C., Park, T., Huang, H., Raj, D., Wiesner, M., Masuyama, Y., Chang, X., Wang, Z.-Q., Squartini, S., Garcia, P., Watanabe, S., "Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101901, Vol. 97, pp. 101901, December 2025.
BibTeX TR2026-008 PDF
- @article{Cornell2025dec,
- author = {Cornell, Samuele and Boeddeker, Christoph and Park, Taejin and Huang, He and Raj, Desh and Wiesner, Matthew and Masuyama, Yoshiki and Chang, Xuankai and Wang, Zhong-Qiu and Squartini, Stefano and Garcia, Paola and Watanabe, Shinji},
- title = {{Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges}},
- journal = {Computer Speech \& Language},
- year = 2025,
- volume = 97,
- pages = 101901,
- month = dec,
- doi = {10.1016/j.csl.2025.101901},
- url = {https://www.merl.com/publications/TR2026-008}
- }
Hussein, A., Khurana, S., Wichern, G., Germain, F.G., Le Roux, J., "HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement", Interspeech, DOI: 10.21437/Interspeech.2025-2063, August 2025, pp. 5393-5397.
BibTeX TR2025-122 PDF
- @inproceedings{Hussein2025aug,
- author = {Hussein, Amir and Khurana, Sameer and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and {Le Roux}, Jonathan},
- title = {{HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement}},
- booktitle = {Interspeech},
- year = 2025,
- pages = {5393--5397},
- month = aug,
- publisher = {ISCA},
- doi = {10.21437/Interspeech.2025-2063},
- url = {https://www.merl.com/publications/TR2025-122}
- }
Masuyama, Y., "Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition", Tech. Rep. TR2025-097, Jelinek Summer Workshop on Speech and Language Technology (JSALT), June 2025.
BibTeX TR2025-097 PDF
- @techreport{Masuyama2025jun,
- author = {Masuyama, Yoshiki},
- title = {{Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition}},
- institution = {Jelinek Summer Workshop on Speech and Language Technology (JSALT)},
- year = 2025,
- month = jun,
- url = {https://www.merl.com/publications/TR2025-097}
- }
Masuyama, Y., Chang, X., Zhang, W., Cornell, S., Wang, Z.-Q., Ono, N., Qian, Y., Watanabe, S., "An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101813, Vol. 95, pp. 101813, May 2025.
BibTeX TR2025-054 PDF
- @article{Masuyama2025may,
- author = {Masuyama, Yoshiki and Chang, Xuankai and Zhang, Wangyou and Cornell, Samuele and Wang, Zhong-Qiu and Ono, Nobutaka and Qian, Yanmin and Watanabe, Shinji},
- title = {{An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation}},
- journal = {Computer Speech \& Language},
- year = 2025,
- volume = 95,
- pages = 101813,
- month = may,
- doi = {10.1016/j.csl.2025.101813},
- issn = {0885-2308},
- url = {https://www.merl.com/publications/TR2025-054}
- }
Liu, J., Wang, Y., Koike-Akino, T., Nakai, T., Oonishi, K., Higashi, T., "MEL-PETs Defense for the NeurIPS 2024 LLM Privacy Challenge Blue Team Track", LLM Privacy Challenge at Neural Information Processing Systems (NeurIPS) 2024, December 2024.
BibTeX TR2024-166 PDF Video Software Presentation
- @inproceedings{Liu2024dec,
- author = {Liu, Jing and Wang, Ye and Koike-Akino, Toshiaki and Nakai, Tsunato and Oonishi, Kento and Higashi, Takuya},
- title = {{MEL-PETs Defense for the NeurIPS 2024 LLM Privacy Challenge Blue Team Track}},
- booktitle = {LLM Privacy Challenge at Neural Information Processing Systems (NeurIPS) 2024},
- year = 2024,
- month = dec,
- url = {https://www.merl.com/publications/TR2024-166}
- }
Mitsui, Y., Aihara, R., Hori, T., Le Roux, J., Taguchi, S., "Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing", OTOGAKU Symposium, June 2024.
BibTeX TR2024-073 PDF
- @inproceedings{Mitsui2024jun,
- author = {Mitsui, Yoshiki and Aihara, Ryo and Hori, Takaaki and {Le Roux}, Jonathan and Taguchi, Shinya},
- title = {{Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing}},
- booktitle = {OTOGAKU Symposium},
- year = 2024,
- month = jun,
- publisher = {Information Processing Society of Japan},
- issn = {2188-8663},
- url = {https://www.merl.com/publications/TR2024-073}
- }
Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2022.3195367, Vol. 16, No. 6, pp. 1424-1438, September 2022.
BibTeX TR2022-112 PDF
- @article{Higuchi2022sep,
- author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
- title = {{Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- year = 2022,
- volume = 16,
- number = 6,
- pages = {1424--1438},
- month = sep,
- doi = {10.1109/JSTSP.2022.3195367},
- issn = {1941-0484},
- url = {https://www.merl.com/publications/TR2022-112}
- }
Chang, X., Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747375, April 2022, pp. 7322-7326.
BibTeX TR2022-021 PDF
- @inproceedings{Chang2022apr,
- author = {Chang, Xuankai and Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7322--7326},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9747375},
- url = {https://www.merl.com/publications/TR2022-021}
- }
Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746275, April 2022, pp. 7672-7676.
BibTeX TR2022-026 PDF
- @inproceedings{Higuchi2022apr,
- author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
- title = {{Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7672--7676},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9746275},
- url = {https://www.merl.com/publications/TR2022-026}
- }
Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Sequence Transduction with Graph-based Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747788, April 2022, pp. 7212-7216.
BibTeX TR2022-024 PDF
- @inproceedings{Moritz2022apr,
- author = {Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Sequence Transduction with Graph-based Supervision}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7212--7216},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9747788},
- url = {https://www.merl.com/publications/TR2022-024}
- }
Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2021-571, September 2021, pp. 726-730.
BibTeX TR2021-103 PDF
- @inproceedings{Higuchi2021sep,
- author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
- title = {{Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition}},
- booktitle = {Interspeech},
- year = 2021,
- pages = {726--730},
- month = sep,
- doi = {10.21437/Interspeech.2021-571},
- url = {https://www.merl.com/publications/TR2021-103}
- }
Hori, T., Moritz, N., Hori, C., Le Roux, J., "Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers", Interspeech, DOI: 10.21437/Interspeech.2021-1643, August 2021, pp. 2097-2101.
BibTeX TR2021-100 PDF
- @inproceedings{Hori2021aug3,
- author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and {Le Roux}, Jonathan},
- title = {{Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers}},
- booktitle = {Interspeech},
- year = 2021,
- pages = {2097--2101},
- month = aug,
- doi = {10.21437/Interspeech.2021-1643},
- url = {https://www.merl.com/publications/TR2021-100}
- }
Moritz, N., Hori, T., Le Roux, J., "Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2021-1693, August 2021, pp. 1822-1826.
BibTeX TR2021-094 PDF
- @inproceedings{Moritz2021aug,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition}},
- booktitle = {Interspeech},
- year = 2021,
- pages = {1822--1826},
- month = aug,
- doi = {10.21437/Interspeech.2021-1693},
- url = {https://www.merl.com/publications/TR2021-094}
- }
Moritz, N., Hori, T., Le Roux, J., "Capturing Multi-Resolution Context by Dilated Self-Attention", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9415001, June 2021, pp. 5869-5873.
BibTeX TR2021-036 PDF
- @inproceedings{Moritz2021jun,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Capturing Multi-Resolution Context by Dilated Self-Attention}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {5869--5873},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9415001},
- url = {https://www.merl.com/publications/TR2021-036}
- }
Khurana, S., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414299, June 2021, pp. 6553-6557.
BibTeX TR2021-039 PDF
- @inproceedings{Khurana2021jun,
- author = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {6553--6557},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9414299},
- url = {https://www.merl.com/publications/TR2021-039}
- }
Moritz, N., Hori, T., Le Roux, J., "Semi-Supervised Speech Recognition via Graph-Based Temporal Classification", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414058, June 2021, pp. 6548-6552.
BibTeX TR2021-037 PDF
- @inproceedings{Moritz2021jun2,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Semi-Supervised Speech Recognition via Graph-Based Temporal Classification}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {6548--6552},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9414058},
- url = {https://www.merl.com/publications/TR2021-037}
- }
Hori, T., Moritz, N., Hori, C., Le Roux, J., "Transformer-based Long-context End-to-end Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2020-2928, October 2020, pp. 5011-5015.
BibTeX TR2020-139 PDF Presentation
- @inproceedings{Hori2020oct,
- author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and {Le Roux}, Jonathan},
- title = {{Transformer-based Long-context End-to-end Speech Recognition}},
- booktitle = {Interspeech},
- year = 2020,
- pages = {5011--5015},
- month = oct,
- doi = {10.21437/Interspeech.2020-2928},
- issn = {1990-9772},
- url = {https://www.merl.com/publications/TR2020-139}
- }
Jayashankar, T., Le Roux, J., Moulin, P., "Detecting Audio Attacks on ASR Systems with Dropout Uncertainty", Interspeech, DOI: 10.21437/Interspeech.2020-1846, October 2020, pp. 4671-4675.
BibTeX TR2020-137 PDF Presentation
- @inproceedings{Jayashankar2020oct,
- author = {Jayashankar, Tejas and {Le Roux}, Jonathan and Moulin, Pierre},
- title = {{Detecting Audio Attacks on ASR Systems with Dropout Uncertainty}},
- booktitle = {Interspeech},
- year = 2020,
- pages = {4671--4675},
- month = oct,
- doi = {10.21437/Interspeech.2020-1846},
- issn = {1990-9772},
- url = {https://www.merl.com/publications/TR2020-137}
- }
Moritz, N., Wichern, G., Hori, T., Le Roux, J., "All-in-One Transformer: Unifying Speech Recognition, Audio Tagging, and Event Detection", Interspeech, DOI: 10.21437/Interspeech.2020-2757, October 2020, pp. 3112-3116.
BibTeX TR2020-138 PDF Presentation
- @inproceedings{Moritz2020oct,
- author = {Moritz, Niko and Wichern, Gordon and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{All-in-One Transformer: Unifying Speech Recognition, Audio Tagging, and Event Detection}},
- booktitle = {Interspeech},
- year = 2020,
- pages = {3112--3116},
- month = oct,
- doi = {10.21437/Interspeech.2020-2757},
- issn = {1990-9772},
- url = {https://www.merl.com/publications/TR2020-138}
- }
Moritz, N., Hori, T., Le Roux, J., "Streaming Automatic Speech Recognition With The Transformer Model", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054476, April 2020, pp. 6074-6078.
BibTeX TR2020-040 PDF Video Presentation
- @inproceedings{Moritz2020apr,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Streaming Automatic Speech Recognition With The Transformer Model}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2020,
- pages = {6074--6078},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP40776.2020.9054476},
- issn = {2379-190X},
- isbn = {978-1-5090-6631-5},
- url = {https://www.merl.com/publications/TR2020-040}
- }
Sari, L., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054249, April 2020, pp. 7384-7388.
BibTeX TR2020-037 PDF Video Presentation
- @inproceedings{Sari2020apr,
- author = {Sari, Leda and Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2020,
- pages = {7384--7388},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP40776.2020.9054249},
- issn = {2379-190X},
- isbn = {978-1-5090-6631-5},
- url = {https://www.merl.com/publications/TR2020-037}
- }
Li, R., Wang, X., Mallidi, H., Watanabe, S., Hori, T., Hermansky, H., "Multi-Stream End-to-End Speech Recognition", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2019.2959721, Vol. 28, pp. 646-655, March 2020.
BibTeX TR2020-030 PDF
- @article{Li2020mar,
- author = {Li, Ruizhi and Wang, Xiaofei and Mallidi, Harish and Watanabe, Shinji and Hori, Takaaki and Hermansky, Hynek},
- title = {{Multi-Stream End-to-End Speech Recognition}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2020,
- volume = 28,
- pages = {646--655},
- month = mar,
- doi = {10.1109/TASLP.2019.2959721},
- url = {https://www.merl.com/publications/TR2020-030}
- }
Chang, X., Zhang, W., Qian, Y., Le Roux, J., Watanabe, S., "MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 237-244.
BibTeX TR2019-157 PDF
- @inproceedings{Chang2019dec,
- author = {Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and {Le Roux}, Jonathan and Watanabe, Shinji},
- title = {{MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2019,
- pages = {237--244},
- month = dec,
- isbn = {978-1-7281-0305-1},
- url = {https://www.merl.com/publications/TR2019-157}
- }
Karita, S., Chen, N., Hayashi, T., Hori, T., Inaguma, H., Jiang, Z., Someki, M., Yalta Soplin, N.E., Yamamoto, R., Wang, X., Watanabe, S., Yoshimura, T., Zhang, W., "A Comparative Study on Transformer Vs RNN in Speech Applications", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU46091.2019.9003750, December 2019, pp. 449-456.
BibTeX TR2019-158 PDF
- @inproceedings{Karita2019dec,
- author = {Karita, Shigeki and Chen, Nanxin and Hayashi, Tomoki and Hori, Takaaki and Inaguma, Hirofumi and Jiang, Ziyan and Someki, Masao and {Yalta Soplin}, Nelson Enrique and Yamamoto, Ryuichi and Wang, Xiaofei and Watanabe, Shinji and Yoshimura, Takenori and Zhang, Wangyou},
- title = {{A Comparative Study on Transformer Vs RNN in Speech Applications}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2019,
- pages = {449--456},
- month = dec,
- doi = {10.1109/ASRU46091.2019.9003750},
- url = {https://www.merl.com/publications/TR2019-158}
- }
Moritz, N., Hori, T., Le Roux, J., "Streaming End-to-End Speech Recognition with Joint CTC-Attention Based Models", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 936-943.
BibTeX TR2019-159 PDF
- @inproceedings{Moritz2019dec,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Streaming End-to-End Speech Recognition with Joint CTC-Attention Based Models}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2019,
- pages = {936--943},
- month = dec,
- isbn = {978-1-7281-0305-1},
- url = {https://www.merl.com/publications/TR2019-159}
- }