(Newer version: Languages and Treebanks in HamleDT 3.0)
Code | TCode | Train/test split | Train sentences | Train tokens | Test sentences | Test tokens | Training sentences % | Test sentences % | Total sentences | Total tokens | Average sentence length | Nonprojective deps | Nonprojective percent |
ar | padtr349 | ours | 6776 | 249600 | 771 | 27823 | 90 | 10 | 7547 | 277423 | 36,76 | 969 | 0,35 |
bg | conll2006 | official | 12823 | 190217 | 398 | 5934 | 97 | 3 | 13221 | 196151 | 14,84 | 747 | 0,38 |
bn | icon2010 | official | 979 | 6440 | 150 | 812 | 87 | 13 | 1129 | 7252 | 6,42 | 78 | 1,08 |
ca | conll2009 | official etest | 13200 | 390302 | 1724 | 53015 | 88 | 12 | 14924 | 443317 | 29,70 | 0 | 0,00 |
cs | pdt30 | official etest | 77765 | 1330152 | 10148 | 173586 | 88 | 12 | 87913 | 1503738 | 17,10 | 28488 | 1,89 |
da | conll2006 | official | 5190 | 94386 | 322 | 5852 | 94 | 6 | 5512 | 100238 | 18,19 | 988 | 0,99 |
de | conll2009 | official etest | 36020 | 648677 | 2000 | 32033 | 95 | 5 | 38020 | 680710 | 17,90 | 15875 | 2,33 |
el | conll2007 | official | 2705 | 65419 | 197 | 4804 | 93 | 7 | 2902 | 70223 | 24,20 | 823 | 1,17 |
en | conll2007 | official | 18577 | 446573 | 214 | 5003 | 99 | 1 | 18791 | 451576 | 24,03 | 1493 | 0,33 |
es | conll2009 | official | 14329 | 427442 | 1655 | 50368 | 90 | 10 | 15984 | 477810 | 29,89 | 0 | 0,00 |
et | puudepank | ours | 1184 | 8535 | 131 | 956 | 90 | 10 | 1315 | 9491 | 7,22 | 7 | 0,07 |
eu | bdt | official | 10104 | 137309 | 1122 | 14295 | 90 | 10 | 11226 | 151604 | 13,50 | 1925 | 1,27 |
fa | perdt | ours | 12126 | 182878 | 329 | 6694 | 97 | 3 | 12455 | 189572 | 15,22 | 3357 | 1,77 |
fi | turku | ours | 3877 | 53151 | 430 | 5425 | 90 | 10 | 4307 | 58576 | 13,60 | 299 | 0,51 |
grc | agdt | ours | 20632 | 302957 | 528 | 5925 | 98 | 2 | 21160 | 308882 | 14,60 | 60469 | 19,58 |
hi | hydt05 | official | 12041 | 268093 | 1233 | 26416 | 91 | 9 | 13274 | 294509 | 22,19 | 4294 | 1,46 |
hu | conll2007 | official | 6034 | 131799 | 390 | 7344 | 94 | 6 | 6424 | 139143 | 21,66 | 4032 | 2,90 |
it | conll2007 | official | 3110 | 71199 | 249 | 5096 | 93 | 7 | 3359 | 76295 | 22,71 | 354 | 0,46 |
ja | conll2007 | official | 17044 | 151461 | 709 | 5711 | 96 | 4 | 17753 | 157172 | 8,85 | 1736 | 1,10 |
la | ldt | ours | 3157 | 48354 | 316 | 4789 | 91 | 9 | 3473 | 53143 | 15,30 | 4042 | 7,61 |
nl | conll2006 | official | 13349 | 195069 | 386 | 5585 | 97 | 3 | 13735 | 200654 | 14,61 | 10858 | 5,41 |
pt | conll2006 | official | 9071 | 206678 | 288 | 5867 | 97 | 3 | 9359 | 212545 | 22,71 | 2778 | 1,31 |
ro | rodt | ours | 3776 | 33510 | 266 | 2640 | 93 | 7 | 4042 | 36150 | 8,94 | 0 | 0,00 |
ru | syntagrus | ours | 34493 | 494007 | 402 | 3458 | 99 | 1 | 34895 | 497465 | 14,26 | 4146 | 0,83 |
sk | sta1 | ours | 51941 | 815313 | 5494 | 85985 | 90 | 10 | 57435 | 901298 | 15,69 | 15526 | 1,72 |
sl | conll2006 | official | 1534 | 28750 | 402 | 6390 | 79 | 21 | 1936 | 35140 | 18,15 | 675 | 1,92 |
sv | conll2006 | official | 11042 | 191467 | 389 | 5656 | 97 | 3 | 11431 | 197123 | 17,24 | 1928 | 0,98 |
ta | tamiltb | ours | 480 | 7592 | 120 | 1989 | 80 | 20 | 600 | 9581 | 15,97 | 15 | 0,16 |
te | icon2010 | official | 1300 | 5125 | 150 | 597 | 90 | 10 | 1450 | 5722 | 3,95 | 13 | 0,23 |
tr | conll2007 | official | 5635 | 65182 | 300 | 4513 | 95 | 5 | 5935 | 69695 | 11,74 | 3716 | 5,33 |
@inproceedings{ar, author = {Otakar Smr{\v{z}} and Viktor Bielick{\'{y}} and Iveta Kou{\v{r}}ilov{\'{a}} and Jakub Kr{\'{a}}{\v{c}}mar and Jan Haji{\v{c}} and Petr Zem{\'{a}}nek}, year = {2008}, title = {{P}rague {A}rabic Dependency Treebank: A Word on the Million Words}, booktitle = {Proceedings of the Workshop on Arabic and Local Languages ({LREC} 2008)}, publisher = {European Language Resources Association}, address = {Marrakech, Morocco}, pages = {16--23}, isbn = {2-9517408-4-0}, } @inproceedings{bg, author = {Kiril Simov and Petya Osenova}, year = 2005, title = {Extending the Annotation of {B}ul{T}ree{B}ank: Phase 2}, booktitle = {The Fourth Workshop on Treebanks and Linguistic Theories (TLT 2005)}, month = {December}, address = {Barcelona}, pages = {173--184} } @misc{cs, author = {Jan Haji{\v{c}} and Jarmila Panevov{\'{a}} and Eva Haji{\v{c}}ov{\'{a}} and Petr Sgall and Petr Pajas and Jan {\v{S}}t{\v{e}}p{\'{a}}nek and Ji{\v{r}}{\'{i}} Havelka and Marie Mikulov{\'{a}} and Zden{\v{e}}k {\v{Z}}abokrtsk{\'{y}} and Magda {\v{S}}ev{\v{c}}{\'{i}}kov{\'{a}}-Raz{\'{i}}mov{\'{a}}}, year = {2006} title = {{Prague Dependency Treebank 2.0}}, howpublished = {CD-ROM, Linguistic Data Consortium, LDC Catalog No.: LDC2006T01, Philadelphia}, publisher = {Linguistic Data Consortium}, address = {Philadelphia, {PA}, {USA}}, isbn = {1-58563-370-4}, } @misc{da, author = {Kromann, Matthias T. 
and Mikkelsen, Line and Stine Kern Lynge}, year = {2004} title = {Danish Dependency Treebank}, url = {http://code.google.com/p/copenhagen-dependency-treebank/}, address = {K{\o}benhavn, Denmark}, } @inproceedings{de, author = {Sabine Brants and Stefanie Dipper and Silvia Hansen and Wolfgang Lezius and George Smith}, year = {2002}, title = {The {TIGER} Treebank}, booktitle = "Proceedings of the Workshop on Treebanks and Linguistic Theories", address = {Sozopol}, postscript = "http://www.ims.uni-stuttgart.de/projekte/TIGER/paper/treeling2002.ps.gz", pdf = "http://www.ims.uni-stuttgart.de/projekte/TIGER/paper/treeling2002.pdf", keywords = {Treebank,German,TIGER} } @inproceedings{el, author = {Prokopis Prokopidis and Elina Desipri and Maria Koutsombogera and Harris Papageorgiou and Stelios Piperidis}, year = {2005}, title = {Theoretical and practical issues in the construction of a {G}reek dependency treebank}, booktitle = {In Proc. of the 4th Workshop on Treebanks and Linguistic Theories (TLT)}, pages = {149--160} } @inproceedings{en, author = {Surdeanu, Mihai and Johansson, Richard and Meyers, Adam and Màrquez, Lluís and Nivre, Joakim}, year = {2008}, title = {The {CoNLL-2008} Shared Task on Joint Parsing of Syntactic and Semantic Dependencies}, booktitle = {Proceedings of {CoNLL}} } @article{penn, author = {Marcus, Mitchell P. and Santorini, Beatrice and Marcinkiewicz, Mary Ann}, year = {1993}, title = {Building a Large Annotated Corpus of English: The Penn Treebank}, journal = {Computational Linguistics}, volume = {19}, number = {2}, pages = {313--330} } % Same reference for ca and es! 
@inproceedings{es, author = {Mariona Taul{\'e} and Maria Ant{\`o}nia Mart\'{\i} and Marta Recasens}, title = {{AnCora}: Multilevel Annotated Corpora for {Catalan} and {Spanish}}, booktitle = {LREC}, year = {2008}, ee = {http://www.lrec-conf.org/proceedings/lrec2008/summaries/35.html}, crossref = {DBLP:conf/lrec/2008}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{et, author = {Bick, Eckhard and Uibo, Heli and Müürisep, Kaili}, year = {2004}, title = {Arborest -- a {VISL}-Style Treebank Derived from an {E}stonian Constraint Grammar Corpus}, booktitle = {Proceedings of Treebanks and Linguistic Theories}, url = {http://beta.visl.sdu.dk/pdf/Bick_Uibo_Muurisep_TLT04.pdf} } @inproceedings{eu, author = {Aduriz, Itzair and Aranzabe, María Jesús and Arriola, Jose Mari and Atutxa, Aitziber and Díaz de Ilarraza, Arantza and Garmendia, Aitzpea and Oronoz, Maite}, year = {2003}, title = {Construction of a {Basque} dependency treebank}, booktitle = "Proceedings of the 2nd Workshop on Treebanks and Linguistic Theories", } @inproceedings{fa, author = {Rasooli, Mohammad Sadegh and Moloodi, Amirsaeid and Kouhestani, Manouchehr and Minaei-Bidgoli, Behrouz}, year = {2011}, title = {A Syntactic Valency Lexicon for {Persian} Verbs: The First Steps towards {Persian} Dependency Treebank}, booktitle = {5th Language \& Technology Conference ({LTC}): Human Language Technologies as a Challenge for Computer Science and Linguistics}, pages = {227--231}, address = {Poznań, Poland} } @inproceedings{fi, author = {Katri Haverinen and Timo Viljanen and Veronika Laippala and Samuel Kohonen and Filip Ginter and Tapio Salakoski}, title = {Treebanking {F}innish}, booktitle = {Proceedings of the Ninth International Workshop on Treebanks and Linguistic Theories (TLT9)}, year = {2010}, pages = {79--90}, url = {http://hdl.handle.net/10062/15936} } % Same reference for grc and la! 
@incollection {grc, author = {Bamman, David and Crane, Gregory}, year = {2011} affiliation = {Perseus Project, Tufts University, Medford/Somerville, USA}, title = {The {A}ncient {G}reek and {L}atin Dependency Treebanks}, booktitle = {Language Technology for Cultural Heritage}, series = {Theory and Applications of Natural Language Processing}, publisher = {Springer Berlin Heidelberg}, isbn = {978-3-642-20227-8}, keyword = {Computer Science}, pages = {79-98}, } @inproceedings{it, author = {Simonetta Montemagni and Francesco Barsotti and Marco Battista and Nicoletta Calzolari and Ornella Corazzari and Alessandro Lenci and Antonio Zampolli and Francesca Fanciulli and Maria Massetani and Remo Raffaelli and Roberto Basili and Maria Teresa Pazienza and Dario Saracino and Fabio Zanzotto and Nadia Mana and Fabio Pianesi and Rodolfo Delmonte}, year = 2003, title = {Building the {I}talian Syntactic-Semantic Treebank}, booktitle = {Building and using Parsed Corpora}, pages = {189--210}, editor = {Anne Abeillé}, series = {Language and Speech series}, address = {Dordrecht}, publisher = {Kluwer} } % Same reference for hi, bn and te! 
@inproceedings{hi, author = {Husain, Samar and Mannem, Prashanth and Ambati, Bharat and Gadde, Phani}, year = {2010}, title = {The {ICON-2010} tools contest on {Indian} language dependency parsing}, booktitle = {Proceedings of {ICON-2010} Tools Contest on {Indian} Language Dependency Parsing}, address = {Kharagpur, India} } @inproceedings{hu, author = {D{\'o}ra Csendes and J{\'a}nos Csirik and Tibor Gyim{\'o}thy and Andr{\'a}s Kocsor}, year = {2005}, title = {The {S}zeged Treebank}, booktitle = {TSD}, pages = {123-131}, ee = {http://dx.doi.org/10.1007/11551874_16}, crossref = {DBLP:conf/tsd/2005}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{ja, author = {Kawata, Yasuhiro and Bartels, Julia}, year = {2000}, title = {Stylebook for the {Japanese} Treebank in {Verbmobil}}, booktitle = {Report 240}, month = {September 29}, address = {Tübingen, Germany} } @inproceedings{nl, author = {Leonoor van der Beek and Bouma, Gosse and Daciuk, Jan and Gaustad, Tanja and Malouf, Robert and Gertjan van Noord and Prins, Robbert and Villada, Begoña}, year = {2002}, title = {Chapter 5. 
The {Alpino} Dependency Treebank}, booktitle = {Algorithms for Linguistic Processing {NWO PIONIER} Progress Report}, address = {Groningen, The Netherlands}, url = {http://odur.let.rug.nl/~vannoord/trees/Papers/report_ch5.pdf} } @inproceedings{pt, author = {Susana Afonso and Eckhard Bick and Renato Haber and Diana Santos}, year = {2002}, title = {{{``}Floresta sint{\'a}(c)tica{''}:} a treebank for {P}ortuguese}, booktitle = {Proceedings of the 3rd International Conference on Language Resources and Evaluation (LREC)}, pages = {1968--1703}, } @mastersthesis{ro, author = {Călăcean, Mihaela}, year = {2008}, title = {Data-driven Dependency Parsing for {R}omanian}, month = {August}, school = {Uppsala University}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.153.6068&rep=rep1&type=pdf} } @inproceedings{ru, author = {Boguslavsky, Igor and Grigorieva, Svetlana and Grigoriev, Nikolai and Kreidlin, Leonid and Frid, Nadezhda}, year = {2000}, title = {{Dependency treebank for Russian: Concept, tools, types of information}}, booktitle = {Proceedings of the 18th conference on Computational linguistics-Volume 2}, pages = {987--991}, organization={Association for Computational Linguistics Morristown, NJ, USA} } @inproceedings{sk, author = {Mária {\noopsort{Szimkova}}Šimková and Radovan Garabík}, year = {2006}, title = {Sintaksičeskaja razmetka v Slovackom nacional'nom korpuse ({\selectlanguage{russian}Синтаксическая разметка в Словацком национальном корпусе})}, booktitle = {Trudy meždunarodnoj konferencii Korpusnaja lingvistika ({\selectlanguage{russian}Tруды международной конференции Корпусная лингвистика}) – 2006}, publisher = {St. 
Petersburg University Press}, address = {Sankt-Peterburg, Russia}, pages = {389--394}, isbn = {5-288-04181-4} } @inproceedings{sl, author = {D\v{z}eroski, Sa\v{s}o and Erjavec, Toma\v{z} and Ledinek, Nina and Pajas, Petr and \v{Z}abokrtsk\'{y}, Zden\v{e}k and \v{Z}ele, Andreja}, year = {2006}, title = {Towards a {S}lovene Dependency Treebank}, booktitle = {Proceedings of the Fifth International Language Resources and Evaluation Conference, {LREC} 2006}, address = {Genova, Italy}, publisher = {European Language Resources Association ({ELRA})}, pages = {1388--1391}, url = {http://hnk.ffzg.hr/bibl/lrec2006/summaries/133.html} } @inproceedings{sv, author = {Nivre, Joakim and Nilsson, Jens and Hall, Johan}, year = {2006}, title = {Talbanken05: A {Swedish} Treebank with Phrase Structure and Dependency Annotation}, booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC} 2006)}, publisher = {European Language Resources Association ({ELRA})}, address = {Genova, Italy}, url = {http://www.msi.vxu.se/users/nivre/research/Talbanken05.html} } @inproceedings{ta, author = {Ramasamy, Loganathan and \v{Z}abokrtsk\'{y}, Zden\v{e}k}, year = {2012}, title = {Prague Dependency Style Treebank for {Tamil}}, booktitle = {Proceedings of {LREC} 2012}, address = {\.{I}stanbul, Turkey} } @inproceedings{tr, author = {Nart B. Atalay and Kemal Oflazer and Bilge Say}, year = {2003} title = {The Annotation Process in the {T}urkish Treebank}, booktitle = {In Proceedings of the 4th International Workshop on Linguistically Interpreted Corpora (LINC)}, publisher = {Association for Computational Linguistics}, address = {Budapest, Hungary}, }