forked from facebookresearch/fairseq2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibliography.bib
181 lines (164 loc) · 9.28 KB
/
bibliography.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
% SGDR: warm-restart learning-rate schedule (Loshchilov & Hutter, arXiv:1608.03983).
% Braced {SGDR} so sentence-casing styles keep the acronym; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1608.03983,
  title         = {{SGDR}: Stochastic Gradient Descent with Warm Restarts},
  author        = {Ilya Loshchilov and Frank Hutter},
  year          = {2017},
  doi           = {10.48550/arXiv.1608.03983},
  url           = {https://arxiv.org/abs/1608.03983},
  eprint        = {1608.03983},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% Layer Normalization (Ba, Kiros & Hinton, arXiv:1607.06450).
% Normalised field spacing to match sibling entries; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1607.06450,
  title         = {Layer Normalization},
  author        = {Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E. Hinton},
  year          = {2016},
  doi           = {10.48550/arXiv.1607.06450},
  url           = {https://arxiv.org/abs/1607.06450},
  eprint        = {1607.06450},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}
% Attention Is All You Need (Vaswani et al., arXiv:1706.03762).
% Added eprint identifiers for consistency with the eprint-style entries in this file.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  title         = {Attention Is All You Need},
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  year          = {2017},
  doi           = {10.48550/arXiv.1706.03762},
  url           = {https://arxiv.org/abs/1706.03762},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Transformer-XL: segment-level recurrence / relative position (Dai et al., arXiv:1901.02860).
% Braced {Transformer-XL} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1901.02860,
  title         = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
  author        = {Zihang Dai and Zhilin Yang and Yiming Yang and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
  year          = {2019},
  doi           = {10.48550/arXiv.1901.02860},
  url           = {https://arxiv.org/abs/1901.02860},
  eprint        = {1901.02860},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% LayerDrop: structured layer dropout (Fan, Grave & Joulin, arXiv:1909.11556).
% Braced {Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.LG) — verify.
@misc{https://doi.org/10.48550/arxiv.1909.11556,
  title         = {Reducing {Transformer} Depth on Demand with Structured Dropout},
  author        = {Fan, Angela and Grave, Edouard and Joulin, Armand},
  year          = {2019},
  doi           = {10.48550/arXiv.1909.11556},
  url           = {https://arxiv.org/abs/1909.11556},
  eprint        = {1909.11556},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  publisher     = {arXiv},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% RMSNorm (Zhang & Sennrich, arXiv:1910.07467).
% Normalised field spacing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1910.07467,
  title         = {Root Mean Square Layer Normalization},
  author        = {Biao Zhang and Rico Sennrich},
  year          = {2019},
  doi           = {10.48550/arXiv.1910.07467},
  url           = {https://arxiv.org/abs/1910.07467},
  eprint        = {1910.07467},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% End-to-end ASR, supervised to semi-supervised (Synnaeve et al., arXiv:1911.08460).
% Braced {ASR}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.1911.08460,
  title         = {End-to-end {ASR}: from Supervised to Semi-Supervised Learning with Modern Architectures},
  author        = {Synnaeve, Gabriel and Xu, Qiantong and Kahn, Jacob and Likhomanenko, Tatiana and Grave, Edouard and Pratap, Vineel and Sriram, Anuroop and Liptchinsky, Vitaliy and Collobert, Ronan},
  year          = {2019},
  doi           = {10.48550/arXiv.1911.08460},
  url           = {https://arxiv.org/abs/1911.08460},
  eprint        = {1911.08460},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Pre-LN vs Post-LN analysis (Xiong et al., arXiv:2002.04745).
% Braced {Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.LG) — verify.
@misc{https://doi.org/10.48550/arxiv.2002.04745,
  title         = {On Layer Normalization in the {Transformer} Architecture},
  author        = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
  year          = {2020},
  doi           = {10.48550/arXiv.2002.04745},
  url           = {https://arxiv.org/abs/2002.04745},
  eprint        = {2002.04745},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  publisher     = {arXiv},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Conformer for speech recognition (Gulati et al., arXiv:2005.08100).
% Braced {Conformer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (eess.AS) — verify.
@misc{https://doi.org/10.48550/arxiv.2005.08100,
  title         = {{Conformer}: Convolution-augmented {Transformer} for Speech Recognition},
  author        = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  year          = {2020},
  doi           = {10.48550/arXiv.2005.08100},
  url           = {https://arxiv.org/abs/2005.08100},
  eprint        = {2005.08100},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  publisher     = {arXiv},
  keywords      = {Audio and Speech Processing (eess.AS), Machine Learning (cs.LG), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% wav2vec 2.0 self-supervised speech pre-training (Baevski et al., arXiv:2006.11477).
% Braced {wav2vec 2.0} — sentence-casing styles would otherwise capitalise the
% deliberately lowercase first word. doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2006.11477,
  title         = {{wav2vec 2.0}: A Framework for Self-Supervised Learning of Speech Representations},
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  year          = {2020},
  doi           = {10.48550/arXiv.2006.11477},
  url           = {https://arxiv.org/abs/2006.11477},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% fairseq S2T speech-to-text toolkit (Wang et al., arXiv:2010.05171).
% Braced {fairseq S2T}/{fairseq} to protect the lowercase project name and acronym;
% added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2010.05171,
  title         = {{fairseq S2T}: Fast Speech-to-Text Modeling with {fairseq}},
  author        = {Wang, Changhan and Tang, Yun and Ma, Xutai and Wu, Anne and Popuri, Sravya and Okhonko, Dmytro and Pino, Juan},
  year          = {2020},
  doi           = {10.48550/arXiv.2010.05171},
  url           = {https://arxiv.org/abs/2010.05171},
  eprint        = {2010.05171},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% RoFormer: rotary position embedding (Su et al., arXiv:2104.09864).
% Braced {RoFormer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2104.09864,
  title         = {{RoFormer}: Enhanced {Transformer} with Rotary Position Embedding},
  author        = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
  year          = {2021},
  doi           = {10.48550/arXiv.2104.09864},
  url           = {https://arxiv.org/abs/2104.09864},
  eprint        = {2104.09864},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
}
% W2v-BERT speech pre-training (Chung et al., arXiv:2108.06209).
% Braced {W2v-BERT} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2108.06209,
  title         = {{W2v-BERT}: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training},
  author        = {Yu-An Chung and Yu Zhang and Wei Han and Chung-Cheng Chiu and James Qin and Ruoming Pang and Yonghui Wu},
  year          = {2021},
  doi           = {10.48550/arXiv.2108.06209},
  url           = {https://arxiv.org/abs/2108.06209},
  eprint        = {2108.06209},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% ALiBi: attention with linear biases (Press, Smith & Lewis, arXiv:2108.12409).
% Added eprint identifiers for consistency with the eprint-style entries.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2108.12409,
  title         = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  author        = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
  year          = {2021},
  doi           = {10.48550/arXiv.2108.12409},
  url           = {https://arxiv.org/abs/2108.12409},
  eprint        = {2108.12409},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% NormFormer: extra normalization layers (Shleifer, Weston & Ott, arXiv:2110.09456).
% Braced {NormFormer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2110.09456,
  title         = {{NormFormer}: Improved {Transformer} Pretraining with Extra Normalization},
  author        = {Shleifer, Sam and Weston, Jason and Ott, Myle},
  year          = {2021},
  doi           = {10.48550/arXiv.2110.09456},
  url           = {https://arxiv.org/abs/2110.09456},
  eprint        = {2110.09456},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% M-Adapter: modality adaptation for speech-to-text translation (Zhao et al., arXiv:2207.00952).
% Braced {M-Adapter}; normalised spacing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2207.00952,
  title         = {{M-Adapter}: Modality Adaptation for End-to-End Speech-to-Text Translation},
  author        = {Jinming Zhao and Hao Yang and Ehsan Shareghi and Gholamreza Haffari},
  year          = {2022},
  doi           = {10.48550/arXiv.2207.00952},
  url           = {https://arxiv.org/abs/2207.00952},
  eprint        = {2207.00952},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% NLLB: No Language Left Behind (NLLB Team et al., arXiv:2207.04672).
% Fix: braced the corporate author {NLLB Team} — unbraced, BibTeX parses it as
% First="NLLB", Last="Team", giving wrong sorting and labels.
@misc{https://doi.org/10.48550/arxiv.2207.04672,
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  author        = {{NLLB Team} and Marta R. Costa-jussà and James Cross and Onur Çelebi and Maha Elbayad and Kenneth Heafield and Kevin Heffernan and Elahe Kalbassi and Janice Lam and Daniel Licht and Jean Maillard and Anna Sun and Skyler Wang and Guillaume Wenzek and Al Youngblood and Bapi Akula and Loic Barrault and Gabriel Mejia Gonzalez and Prangthip Hansanti and John Hoffman and Semarley Jarrett and Kaushik Ram Sadagopan and Dirk Rowe and Shannon Spruit and Chau Tran and Pierre Andrews and Necip Fazil Ayan and Shruti Bhosale and Sergey Edunov and Angela Fan and Cynthia Gao and Vedanuj Goswami and Francisco Guzmán and Philipp Koehn and Alexandre Mourachko and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Jeff Wang},
  year          = {2022},
  doi           = {10.48550/arXiv.2207.04672},
  url           = {https://arxiv.org/abs/2207.04672},
  eprint        = {2207.04672},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% UnitY: two-pass direct speech-to-speech translation (Inaguma et al., arXiv:2212.08055).
% Braced {UnitY} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2212.08055,
  title         = {{UnitY}: Two-pass Direct Speech-to-speech Translation with Discrete Units},
  author        = {Hirofumi Inaguma and Sravya Popuri and Ilia Kulikov and Peng-Jen Chen and Changhan Wang and Yu-An Chung and Yun Tang and Ann Lee and Shinji Watanabe and Juan Pino},
  year          = {2023},
  doi           = {10.48550/arXiv.2212.08055},
  url           = {https://arxiv.org/abs/2212.08055},
  eprint        = {2212.08055},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}