- Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement", International Workshop on Acoustic Signal Enhancement (IWAENC), September 2024.
BibTeX TR2024-126 PDF Software- @inproceedings{Saijo2024sep2,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, François G. and Pan, Zexu and Le Roux, Jonathan},
- title = {TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement},
- booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
- year = 2024,
- month = sep,
- url = {https://www.merl.com/publications/TR2024-126}
- }
- Bahrman, L., Fontaine, M., Le Roux, J., Richard, G., "Speech Dereverberation Constrained on Room Impulse Response Characteristics", Interspeech, DOI: 10.21437/Interspeech.2024-1173, September 2024, pp. 622-626.
BibTeX TR2024-121 PDF- @inproceedings{Bahrman2024sep,
- author = {Bahrman, Louis and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
- title = {Speech Dereverberation Constrained on Room Impulse Response Characteristics},
- booktitle = {Interspeech},
- year = 2024,
- pages = {622--626},
- month = sep,
- doi = {10.21437/Interspeech.2024-1173},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-121}
- }
- Khurana, S., Hori, C., Laurent, A., Wichern, G., Le Roux, J., "ZeroST: Zero-Shot Speech Translation", Interspeech, DOI: 10.21437/Interspeech.2024-1088, September 2024, pp. 392-396.
BibTeX TR2024-122 PDF- @inproceedings{Khurana2024sep,
- author = {Khurana, Sameer and Hori, Chiori and Laurent, Antoine and Wichern, Gordon and Le Roux, Jonathan},
- title = {ZeroST: Zero-Shot Speech Translation},
- booktitle = {Interspeech},
- year = 2024,
- pages = {392--396},
- month = sep,
- doi = {10.21437/Interspeech.2024-1088},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-122}
- }
- Pan, Z., Wichern, G., Germain, F.G., Saijo, K., Le Roux, J., "PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2024-1066, September 2024, pp. 582-586.
BibTeX TR2024-124 PDF- @inproceedings{Pan2024sep,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Saijo, Kohei and Le Roux, Jonathan},
- title = {PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation},
- booktitle = {Interspeech},
- year = 2024,
- pages = {582--586},
- month = sep,
- doi = {10.21437/Interspeech.2024-1066},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-124}
- }
- Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "Enhanced Reverberation as Supervision for Unsupervised Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2024-1241, September 2024, pp. 607-611.
BibTeX TR2024-116 PDF Software- @inproceedings{Saijo2024sep,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, François G. and Pan, Zexu and Le Roux, Jonathan},
- title = {Enhanced Reverberation as Supervision for Unsupervised Speech Separation},
- booktitle = {Interspeech},
- year = 2024,
- pages = {607--611},
- month = sep,
- doi = {10.21437/Interspeech.2024-1241},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-116}
- }
- Mitsui, Y., Aihara, R., Hori, T., Le Roux, J., Taguchi, S., "Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing", OTOGAKU Symposium, June 2024.
BibTeX TR2024-073 PDF- @inproceedings{Mitsui2024jun,
- author = {Mitsui, Yoshiki and Aihara, Ryo and Hori, Takaaki and Le Roux, Jonathan and Taguchi, Shinya},
- title = {Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing},
- booktitle = {OTOGAKU Symposium},
- year = 2024,
- month = jun,
- publisher = {Information Processing Society of Japan},
- issn = {2188-8663},
- url = {https://www.merl.com/publications/TR2024-073}
- }
- Kambara, M., Hori, C., Sugiura, K., Ota, K., Jha, D.K., Khurana, S., Jain, S., Corcodel, R., Romeres, D., Le Roux, J., "Human Action Understanding-based Robot Planning using Multimodal LLM", IEEE International Conference on Robotics and Automation (ICRA), June 2024.
BibTeX TR2024-066 PDF- @inproceedings{Kambara2024jun,
- author = {Kambara, Motonari and Hori, Chiori and Sugiura, Komei and Ota, Kei and Jha, Devesh K. and Khurana, Sameer and Jain, Siddarth and Corcodel, Radu and Romeres, Diego and Le Roux, Jonathan},
- title = {Human Action Understanding-based Robot Planning using Multimodal LLM},
- booktitle = {IEEE International Conference on Robotics and Automation (ICRA) Workshop},
- year = 2024,
- month = jun,
- url = {https://www.merl.com/publications/TR2024-066}
- }
- Pan, Z., Wichern, G., Germain, F.G., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", Hands-free Speech Communication and Microphone Arrays (HSCMA), DOI: 10.1109/ICASSPW62465.2024.10626914, April 2024, pp. 174-178.
BibTeX TR2024-029 PDF- @inproceedings{Pan2024apr,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Subramanian, Aswin and Le Roux, Jonathan},
- title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
- booktitle = {Hands-free Speech Communication and Microphone Arrays (HSCMA)},
- year = 2024,
- pages = {174--178},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSPW62465.2024.10626914},
- isbn = {979-8-3503-7451-3},
- url = {https://www.merl.com/publications/TR2024-029}
- }
- Fujihashi, T., Kato, S., Koike-Akino, T., "Implicit Neural Representation for Low-Overhead Graph-Based Holographic-Type Communications", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10445857, April 2024.
BibTeX TR2024-022 PDF- @inproceedings{Fujihashi2024apr,
- author = {Fujihashi, Takuya and Kato, Sorachi and Koike-Akino, Toshiaki},
- title = {Implicit Neural Representation for Low-Overhead Graph-Based Holographic-Type Communications},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10445857},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-022}
- }
- Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), April 2024.
BibTeX TR2024-032 PDF- @inproceedings{Koo2024apr,
- author = {Koo, Junghyun and Wichern, Gordon and Germain, François G. and Khurana, Sameer and Le Roux, Jonathan},
- title = {Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads},
- booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
- year = 2024,
- month = apr,
- url = {https://www.merl.com/publications/TR2024-032}
- }
- Jeon, C.-B., Wichern, G., Germain, F.G., Le Roux, J., "Why does music source separation benefit from cacophony?", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), DOI: 10.1109/ICASSPW62465.2024.10669899, March 2024, pp. 873-877.
BibTeX TR2024-030 PDF Video- @inproceedings{Jeon2024mar,
- author = {Jeon, Chang-Bin and Wichern, Gordon and Germain, François G. and Le Roux, Jonathan},
- title = {Why does music source separation benefit from cacophony?},
- booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
- year = 2024,
- pages = {873--877},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSPW62465.2024.10669899},
- isbn = {979-8-3503-7451-3},
- url = {https://www.merl.com/publications/TR2024-030}
- }
- Bralios, D., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "Generation or Replication: Auscultating Audio Latent Diffusion Models", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447705, March 2024, pp. 1156-1160.
BibTeX TR2024-027 PDF- @inproceedings{Bralios2024mar,
- author = {Bralios, Dimitrios and Wichern, Gordon and Germain, François G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {1156--1160},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10447705},
- url = {https://www.merl.com/publications/TR2024-027}
- }
- Fernandez-Menduina, S., Rapp, J., Mansour, H., Greiff, M., Parsons, K., "Tracking Beyond the Unambiguous Range with Modulo Single-Photon Lidar", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446835, March 2024, pp. 6-10.
BibTeX TR2024-021 PDF- @inproceedings{Fernandez-Menduina2024mar,
- author = {Fernandez-Menduina, Samuel and Rapp, Joshua and Mansour, Hassan and Greiff, Marcus and Parsons, Kieran},
- title = {Tracking Beyond the Unambiguous Range with Modulo Single-Photon Lidar},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {6--10},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10446835},
- url = {https://www.merl.com/publications/TR2024-021}
- }
- Masuyama, Y., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10448477, March 2024, pp. 1016-1020.
BibTeX TR2024-026 PDF Software- @inproceedings{Masuyama2024mar,
- author = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {1016--1020},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10448477},
- url = {https://www.merl.com/publications/TR2024-026}
- }
- Pan, Z., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446333, March 2024, pp. 11456-11460.
BibTeX TR2024-025 PDF- @inproceedings{Pan2024mar,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Khurana, Sameer and Le Roux, Jonathan},
- title = {NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {11456--11460},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10446333},
- url = {https://www.merl.com/publications/TR2024-025}
- }
- Sholokhov, A., Rapp, J., Nabi, S., Brunton, S., Kutz, N., Mansour, H., "Single-pixel imaging of dynamic flows using Neural ODE regularization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447584, March 2024, pp. 2530-2534.
BibTeX TR2024-024 PDF- @inproceedings{Sholokhov2024mar,
- author = {Sholokhov, Aleksei and Rapp, Joshua and Nabi, Saleh and Brunton, Steven and Kutz, Nathan and Mansour, Hassan},
- title = {Single-pixel imaging of dynamic flows using Neural ODE regularization},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {2530--2534},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10447584},
- url = {https://www.merl.com/publications/TR2024-024}
- }
- Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447215, March 2024, pp. 316-320.
BibTeX TR2024-028 PDF- @inproceedings{Wu2024mar,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G. and Le Roux, Jonathan and Watanabe, Shinji},
- title = {Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {316--320},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10447215},
- url = {https://www.merl.com/publications/TR2024-028}
- }
- Kato, S., Wang, P., Koike-Akino, T., Fujihashi, T., Mansour, H., Boufounos, P.T., "Object Trajectory Estimation with Multi-Band Wi-Fi Neural Dynamic Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10445972, March 2024, pp. 13261-13265.
BibTeX TR2024-019 PDF- @inproceedings{Kato2024mar,
- author = {Kato, Sorachi and Wang, Pu and Koike-Akino, Toshiaki and Fujihashi, Takuya and Mansour, Hassan and Boufounos, Petros T.},
- title = {Object Trajectory Estimation with Multi-Band Wi-Fi Neural Dynamic Fusion},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13261--13265},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10445972},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-019}
- }
- Wang, P., Boufounos, P.T., "Monostatic DMG Passive Sensing with Hypothesis Testing", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447134, March 2024, pp. 13381-13385.
BibTeX TR2024-020 PDF- @inproceedings{Wang2024mar,
- author = {Wang, Pu and Boufounos, Petros T.},
- title = {Monostatic DMG Passive Sensing with Hypothesis Testing},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13381--13385},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10447134},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-020}
- }
- Yataka, R., Wang, P., Boufounos, P.T., Takahashi, R., "Radar Perception with Scalable Connective Temporal Relations for Autonomous Driving", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446449, March 2024, pp. 13266-13270.
BibTeX TR2024-023 PDF- @inproceedings{Yataka2024mar,
- author = {Yataka, Ryoma and Wang, Pu and Boufounos, Petros T. and Takahashi, Ryuhei},
- title = {Radar Perception with Scalable Connective Temporal Relations for Autonomous Driving},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13266--13270},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10446449},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-023}
- }
- Baoueb, T., Liu, H., Fontaine, M., Le Roux, J., Richard, G., "SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446830, March 2024, pp. 986-990.
BibTeX TR2024-013 PDF- @inproceedings{Baoueb2024mar,
- author = {Baoueb, Teysir and Liu, Haocheng and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
- title = {SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {986--990},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10446830},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-013}
- }
- Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447600, March 2024, pp. 13296-13300.
BibTeX TR2024-012 PDF- @inproceedings{Hori2024mar,
- author = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and Le Roux, Jonathan},
- title = {Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13296--13300},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10447600},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-012}
- }
- Liu, H., Baoueb, T., Fontaine, M., Le Roux, J., Richard, G., "GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446058, March 2024, pp. 11611-11615.
BibTeX TR2024-014 PDF- @inproceedings{Liu2024mar,
- author = {Liu, Haocheng and Baoueb, Teysir and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
- title = {GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {11611--11615},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10446058},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-014}
- }
- Boeddeker, C., Subramanian, A.S., Wichern, G., Haeb-Umbach, R., Le Roux, J., "TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2024.3350887, Vol. 32, pp. 1185-1197, February 2024.
BibTeX TR2024-006 PDF Software- @article{Boeddeker2024feb,
- author = {Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan},
- title = {TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2024,
- volume = 32,
- pages = {1185--1197},
- month = feb,
- doi = {10.1109/TASLP.2024.3350887},
- issn = {2329-9304},
- url = {https://www.merl.com/publications/TR2024-006}
- }
- Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU57964.2023.10389618, December 2023.
BibTeX TR2023-152 PDF Video- @inproceedings{Pan2023dec2,
- author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G. and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2023,
- month = dec,
- doi = {10.1109/ASRU57964.2023.10389618},
- isbn = {979-8-3503-0689-7},
- url = {https://www.merl.com/publications/TR2023-152}
- }