forked from facebookresearch/fairseq2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibliography.bib
181 lines (164 loc) · 9.28 KB
/
bibliography.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
% SGDR: warm-restart learning-rate schedule (Loshchilov & Hutter, arXiv:1608.03983).
% Braced {SGDR} so sentence-casing styles keep the acronym; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1608.03983,
  title         = {{SGDR}: Stochastic Gradient Descent with Warm Restarts},
  author        = {Ilya Loshchilov and Frank Hutter},
  year          = {2017},
  doi           = {10.48550/arXiv.1608.03983},
  url           = {https://arxiv.org/abs/1608.03983},
  eprint        = {1608.03983},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% Layer Normalization (Ba, Kiros & Hinton, arXiv:1607.06450).
% Normalised field spacing to match sibling entries; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1607.06450,
  title         = {Layer Normalization},
  author        = {Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E. Hinton},
  year          = {2016},
  doi           = {10.48550/arXiv.1607.06450},
  url           = {https://arxiv.org/abs/1607.06450},
  eprint        = {1607.06450},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}
% Attention Is All You Need (Vaswani et al., arXiv:1706.03762).
% Added eprint identifiers for consistency with the eprint-style entries in this file.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  title         = {Attention Is All You Need},
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  year          = {2017},
  doi           = {10.48550/arXiv.1706.03762},
  url           = {https://arxiv.org/abs/1706.03762},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Transformer-XL: segment-level recurrence / relative position (Dai et al., arXiv:1901.02860).
% Braced {Transformer-XL} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1901.02860,
  title         = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
  author        = {Zihang Dai and Zhilin Yang and Yiming Yang and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
  year          = {2019},
  doi           = {10.48550/arXiv.1901.02860},
  url           = {https://arxiv.org/abs/1901.02860},
  eprint        = {1901.02860},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% LayerDrop: structured layer dropout (Fan, Grave & Joulin, arXiv:1909.11556).
% Braced {Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.LG) — verify.
@misc{https://doi.org/10.48550/arxiv.1909.11556,
  title         = {Reducing {Transformer} Depth on Demand with Structured Dropout},
  author        = {Fan, Angela and Grave, Edouard and Joulin, Armand},
  year          = {2019},
  doi           = {10.48550/arXiv.1909.11556},
  url           = {https://arxiv.org/abs/1909.11556},
  eprint        = {1909.11556},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  publisher     = {arXiv},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% RMSNorm (Zhang & Sennrich, arXiv:1910.07467).
% Normalised field spacing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.1910.07467,
  title         = {Root Mean Square Layer Normalization},
  author        = {Biao Zhang and Rico Sennrich},
  year          = {2019},
  doi           = {10.48550/arXiv.1910.07467},
  url           = {https://arxiv.org/abs/1910.07467},
  eprint        = {1910.07467},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% End-to-end ASR, supervised to semi-supervised (Synnaeve et al., arXiv:1911.08460).
% Braced {ASR}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.1911.08460,
  title         = {End-to-end {ASR}: from Supervised to Semi-Supervised Learning with Modern Architectures},
  author        = {Synnaeve, Gabriel and Xu, Qiantong and Kahn, Jacob and Likhomanenko, Tatiana and Grave, Edouard and Pratap, Vineel and Sriram, Anuroop and Liptchinsky, Vitaliy and Collobert, Ronan},
  year          = {2019},
  doi           = {10.48550/arXiv.1911.08460},
  url           = {https://arxiv.org/abs/1911.08460},
  eprint        = {1911.08460},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Pre-LN vs Post-LN analysis (Xiong et al., arXiv:2002.04745).
% Braced {Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.LG) — verify.
@misc{https://doi.org/10.48550/arxiv.2002.04745,
  title         = {On Layer Normalization in the {Transformer} Architecture},
  author        = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
  year          = {2020},
  doi           = {10.48550/arXiv.2002.04745},
  url           = {https://arxiv.org/abs/2002.04745},
  eprint        = {2002.04745},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  publisher     = {arXiv},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% Conformer for speech recognition (Gulati et al., arXiv:2005.08100).
% Braced {Conformer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (eess.AS) — verify.
@misc{https://doi.org/10.48550/arxiv.2005.08100,
  title         = {{Conformer}: Convolution-augmented {Transformer} for Speech Recognition},
  author        = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  year          = {2020},
  doi           = {10.48550/arXiv.2005.08100},
  url           = {https://arxiv.org/abs/2005.08100},
  eprint        = {2005.08100},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  publisher     = {arXiv},
  keywords      = {Audio and Speech Processing (eess.AS), Machine Learning (cs.LG), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% wav2vec 2.0 self-supervised speech pre-training (Baevski et al., arXiv:2006.11477).
% Braced {wav2vec 2.0} — sentence-casing styles would otherwise capitalise the
% deliberately lowercase first word. doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2006.11477,
  title         = {{wav2vec 2.0}: A Framework for Self-Supervised Learning of Speech Representations},
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  year          = {2020},
  doi           = {10.48550/arXiv.2006.11477},
  url           = {https://arxiv.org/abs/2006.11477},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% fairseq S2T speech-to-text toolkit (Wang et al., arXiv:2010.05171).
% Braced {fairseq S2T}/{fairseq} to protect the lowercase project name and acronym;
% added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2010.05171,
  title         = {{fairseq S2T}: Fast Speech-to-Text Modeling with {fairseq}},
  author        = {Wang, Changhan and Tang, Yun and Ma, Xutai and Wu, Anne and Popuri, Sravya and Okhonko, Dmytro and Pino, Juan},
  year          = {2020},
  doi           = {10.48550/arXiv.2010.05171},
  url           = {https://arxiv.org/abs/2010.05171},
  eprint        = {2010.05171},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% RoFormer: rotary position embedding (Su et al., arXiv:2104.09864).
% Braced {RoFormer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2104.09864,
  title         = {{RoFormer}: Enhanced {Transformer} with Rotary Position Embedding},
  author        = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
  year          = {2021},
  doi           = {10.48550/arXiv.2104.09864},
  url           = {https://arxiv.org/abs/2104.09864},
  eprint        = {2104.09864},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
}
% W2v-BERT speech pre-training (Chung et al., arXiv:2108.06209).
% Braced {W2v-BERT} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2108.06209,
  title         = {{W2v-BERT}: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training},
  author        = {Yu-An Chung and Yu Zhang and Wei Han and Chung-Cheng Chiu and James Qin and Ruoming Pang and Yonghui Wu},
  year          = {2021},
  doi           = {10.48550/arXiv.2108.06209},
  url           = {https://arxiv.org/abs/2108.06209},
  eprint        = {2108.06209},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% ALiBi: attention with linear biases (Press, Smith & Lewis, arXiv:2108.12409).
% Added eprint identifiers for consistency with the eprint-style entries.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2108.12409,
  title         = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  author        = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
  year          = {2021},
  doi           = {10.48550/arXiv.2108.12409},
  url           = {https://arxiv.org/abs/2108.12409},
  eprint        = {2108.12409},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% NormFormer: extra normalization layers (Shleifer, Weston & Ott, arXiv:2110.09456).
% Braced {NormFormer}/{Transformer}; added eprint identifiers for consistency.
% NOTE(review): primaryClass taken from the leading arXiv keyword (cs.CL) — verify.
@misc{https://doi.org/10.48550/arxiv.2110.09456,
  title         = {{NormFormer}: Improved {Transformer} Pretraining with Extra Normalization},
  author        = {Shleifer, Sam and Weston, Jason and Ott, Myle},
  year          = {2021},
  doi           = {10.48550/arXiv.2110.09456},
  url           = {https://arxiv.org/abs/2110.09456},
  eprint        = {2110.09456},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  publisher     = {arXiv},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
% M-Adapter: modality adaptation for speech-to-text translation (Zhao et al., arXiv:2207.00952).
% Braced {M-Adapter}; normalised spacing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2207.00952,
  title         = {{M-Adapter}: Modality Adaptation for End-to-End Speech-to-Text Translation},
  author        = {Jinming Zhao and Hao Yang and Ehsan Shareghi and Gholamreza Haffari},
  year          = {2022},
  doi           = {10.48550/arXiv.2207.00952},
  url           = {https://arxiv.org/abs/2207.00952},
  eprint        = {2207.00952},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% NLLB: No Language Left Behind (NLLB Team et al., arXiv:2207.04672).
% Fix: braced the corporate author {NLLB Team} — unbraced, BibTeX parses it as
% First="NLLB", Last="Team", giving wrong sorting and labels.
@misc{https://doi.org/10.48550/arxiv.2207.04672,
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  author        = {{NLLB Team} and Marta R. Costa-jussà and James Cross and Onur Çelebi and Maha Elbayad and Kenneth Heafield and Kevin Heffernan and Elahe Kalbassi and Janice Lam and Daniel Licht and Jean Maillard and Anna Sun and Skyler Wang and Guillaume Wenzek and Al Youngblood and Bapi Akula and Loic Barrault and Gabriel Mejia Gonzalez and Prangthip Hansanti and John Hoffman and Semarley Jarrett and Kaushik Ram Sadagopan and Dirk Rowe and Shannon Spruit and Chau Tran and Pierre Andrews and Necip Fazil Ayan and Shruti Bhosale and Sergey Edunov and Angela Fan and Cynthia Gao and Vedanuj Goswami and Francisco Guzmán and Philipp Koehn and Alexandre Mourachko and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Jeff Wang},
  year          = {2022},
  doi           = {10.48550/arXiv.2207.04672},
  url           = {https://arxiv.org/abs/2207.04672},
  eprint        = {2207.04672},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% UnitY: two-pass direct speech-to-speech translation (Inaguma et al., arXiv:2212.08055).
% Braced {UnitY} to protect casing; doi/url derived from the key.
@misc{https://doi.org/10.48550/arxiv.2212.08055,
  title         = {{UnitY}: Two-pass Direct Speech-to-speech Translation with Discrete Units},
  author        = {Hirofumi Inaguma and Sravya Popuri and Ilia Kulikov and Peng-Jen Chen and Changhan Wang and Yu-An Chung and Yun Tang and Ann Lee and Shinji Watanabe and Juan Pino},
  year          = {2023},
  doi           = {10.48550/arXiv.2212.08055},
  url           = {https://arxiv.org/abs/2212.08055},
  eprint        = {2212.08055},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}