% DanTok (NoDaLiDa 2023). The last author's surname carries the particle
% "van der"; it is written in "von Last, First" form so styles sort and
% abbreviate it correctly. Case-sensitive title words are braced as whole words.
@inproceedings{kirstein-hansen-etal-2023-dantok,
    title = "{DanTok}: Domain Beats Language for {Danish} Social Media {POS} Tagging",
    author = {Kirstein Hansen, Kia  and
      Barrett, Maria  and
      M{\"u}ller-Eberstein, Max  and
      Damgaard, Cathrine  and
      Eriksen, Trine  and
      van der Goot, Rob},
    booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
    month = may,
    year = "2023",
    address = "T{\'o}rshavn, Faroe Islands",
    publisher = "University of Tartu Library",
    url = "https://aclanthology.org/2023.nodalida-1.27",
    pages = "271--279",
    abstract = "Language from social media remains challenging to process automatically, especially for non-English languages. In this work, we introduce the first NLP dataset for TikTok comments and the first Danish social media dataset with part-of-speech annotation. We further supply annotations for normalization, code-switching, and annotator uncertainty. As transferring models to such a highly specialized domain is non-trivial, we conduct an extensive study into which source data and modeling decisions most impact the performance. Surprisingly, transferring from in-domain data, even from a different language, outperforms in-language, out-of-domain training. These benefits nonetheless rely on the underlying language models having been at least partially pre-trained on data from the target language. Using our additional annotation layers, we further analyze how normalization, code-switching, and human uncertainty affect the tagging accuracy.",
}
