Package nltk_lite :: Package corpora :: Module udhr
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.corpora.udhr

  1  # Natural Language Toolkit: UDHR Corpus Reader
 
  2  #
 
  3  # Copyright (C) 2001-2007 University of Pennsylvania
 
  4  # Author: Sam Huston <shuston@students.csse.unimelb.edu.au>
 
  5  #         Steven Bird <sb@csse.unimelb.edu.au>
 
  6  # URL: <http://nltk.sf.net>
 
  7  # For license information, see LICENSE.TXT
 
  8  
 
  9  """
 
 10  Read tokens from UDHR Corpus
 
 11  
 
 12  This corpus contains examples of text in over 300 language/encoding combinations,
 
 13  from the Universal Declaration of Human Rights
 
 14  """ 
 15  
 
 16  import os 
 17  from nltk_lite import tokenize 
 18  from nltk_lite.corpora import * 
 19  
 
 20  
 
 21  items = ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', 'Achehnese-Latin1', 'Achuar-Shiwiar-Latin1', 'Adja-UTF8',
 
 22           'Afaan_Oromo_Oromiffa-Latin1', 'Afrikaans-Latin1', 'Aguaruna-Latin1', 'Akuapem_Twi-UTF8',
 
 23           'Albanian_Shqip-Latin1', 'Amahuaca-Latin1', 'Amahuaca', 'Amarakaeri-Latin1',
 
 24           'Amharic-Afenegus6..60375', 'Amuesha-Yanesha-UTF8', 'Arabela-Latin1', 'Arabic_Alarabia-Arabic',
 
 25           'Armenian-DallakHelv', 'Asante-UTF8', 'Ashaninca-Latin1', 'Asheninca-Latin1', 'Asturian_Bable-Latin1',
 
 26           'Aymara-Latin1', 'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
 
 27           'Azeri_Azerbaijani_Latin-Az.Times.Lat0117', 'Balinese-Latin1', 'Bambara-UTF8',
 
 28           'Baoule-UTF8', 'Basque_Euskara-Latin1', 'Batonu_Bariba-UTF8', 'Belorus_Belaruski-Cyrillic',
 
 29           'Belorus_Belaruski-UTF8', 'Bemba-Latin1', 'Bengali-UTF8', 'Beti-UTF8', 'Bhojpuri-Agra',
 
 30           'Bichelamar-Latin1', 'Bikol_Bicolano-Latin1', 'Bora-Latin1', 'Bosnian_Bosanski-Cyrillic',
 
 31           'Bosnian_Bosanski-Latin2', 'Bosnian_Bosanski-UTF8', 'Breton-Latin1', 'Bugisnese-Latin1',
 
 32           'Bulgarian_Balgarski-Cyrillic', 'Bulgarian_Balgarski-UTF8', 'Burmese_Myanmar-UTF8',
 
 33           'Burmese_Myanmar-WinResearcher', 'Cakchiquel-Latin1', 'Campa_Pajonalino-Latin1',
 
 34           'Candoshi-Shapra-Latin1', 'Caquinte-Latin1', 'Cashibo-Cacataibo-Latin1', 'Cashinahua-Latin1',
 
 35           'Catalan_Catala-Latin1', 'Catalan-Latin1', 'Cebuano-Latin1', 'Chamorro-Latin1', 'Chayahuita-Latin1',
 
 36           'Chechewa_Nyanja-Latin1', 'Chickasaw-Latin1', 'Chinanteco-Ajitlan-Latin1', 'Chinanteco-UTF8',
 
 37           'Chinese_Mandarin-GB2312', 'Chinese_Mandarin-HZ', 'Chinese_Mandarin-UTF8', 'Chuuk_Trukese-Latin1',
 
 38           'Cokwe-Latin1', 'Corsican-Latin1', 'Croatian_Hrvatski-Latin2', 'Czech_Cesky-Latin2', 'Czech_Cesky-UTF8',
 
 39           'Czech-Latin2-err', 'Czech-Latin2', 'Czech-UTF8', 'Dagaare-UTF8', 'Dagbani-UTF8', 'Dangme-UTF8',
 
 40           'Danish_Dansk-Latin1', 'Dendi-UTF8', 'Ditammari-UTF8', 'Dutch_Nederlands-Latin1', 'Edo-Latin1',
 
 41           'English-Latin1', 'Esperanto-T61', 'Esperanto-UTF8', 'Estonian_Eesti-Latin1', 'Ewe_Eve-UTF8',
 
 42           'Fante-UTF8', 'Faroese-Latin1', 'Farsi_Persian-UTF8', 'Farsi_Persian-v2-UTF8', 'Fijian-Latin1',
 
 43           'Filipino_Tagalog-Latin1', 'Finnish_Suomi-Latin1', 'Fon-UTF8', 'French_Francais-Latin1',
 
 44           'Frisian-Latin1', 'Friulian_Friulano-Latin1', 'Gagauz_Gagauzi-UTF8', 'Galician_Galego-Latin1',
 
 45           'Garifuna_Garifuna-Latin1', 'Ga-UTF8', 'German_Deutsch-Latin1', 'Gonja-UTF8', 'Greek_Ellinika-Greek',
 
 46           'Greek_Ellinika-UTF8', 'Greenlandic_Inuktikut-Latin1', 'Guarani-Latin1', 'Guen_Mina-UTF8',
 
 47           'Gujarati-UTF8', 'HaitianCreole_Kreyol-Latin1', 'HaitianCreole_Popular-Latin1', 'Hani-Latin1',
 
 48           'Hausa_Haoussa-Latin1', 'Hawaiian-UTF8', 'Hebrew_Ivrit-Hebrew', 'Hebrew_Ivrit-UTF8', 'Hiligaynon-Latin1',
 
 49           'Hindi-UFT8', 'Hindi_web-UFT8', 'Hmong_Miao_Northern-East-Guizhou-Latin1',
 
 50           'Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1', 'Hmong_Miao-SouthernEast-Guizhou-Latin1',
 
 51           'Hrvatski_Croatian-Latin2', 'Huasteco-Latin1', 'Huitoto_Murui-Latin1', 'Hungarian_Magyar-Latin1',
 
 52           'Hungarian_Magyar-Latin2', 'Hungarian_Magyar-Unicode', 'Hungarian_Magyar-UTF8', 'Ibibio_Efik-Latin1',
 
 53           'Icelandic_Yslenska-Latin1', 'Ido-Latin1', 'Igbo-UTF8', 'Iloko_Ilocano-Latin1', 'Indonesian-Latin1',
 
 54           'Interlingua-Latin1', 'Inuktikut_Greenlandic-Latin1', 'IrishGaelic_Gaeilge-Latin1', 'Italian_Italiano-Latin1',
 
 55           'Italian-Latin1', 'Japanese_Nihongo-EUC', 'Japanese_Nihongo-JIS', 'Japanese_Nihongo-SJIS',
 
 56           'Japanese_Nihongo-UTF8', 'Javanese-Latin1', 'Jola-Fogny_Diola-UTF8', 'Kabye-UTF8', 'Kannada-UTF8',
 
 57           'Kaonde-Latin1', 'Kapampangan-Latin1', 'Kasem-UTF8', 'Kazakh-Cyrillic', 'Kazakh-UTF8', 'Kiche_Quiche-Latin1',
 
 58           'Kicongo-Latin1', 'Kimbundu_Mbundu-Latin1', 'Kinyamwezi_Nyamwezi-Latin1', 'Kinyarwanda-Latin1', 'Kituba-Latin1',
 
 59           'Korean_Hankuko-UTF8', 'Kpelewo-UTF8', 'Krio-UTF8', 'Kurdish-UTF8', 'Lamnso_Lam-nso-UTF8', 'Lao-UTF8',
 
 60           'Latin_Latina-Latin1', 'Latin_Latina-v2-Latin1', 'Latvian-Latin1', 'Limba-UTF8', 'Lingala-Latin1',
 
 61           'Lithuanian_Lietuviskai-Baltic', 'Lozi-Latin1', 'Luba-Kasai_Tshiluba-Latin1', 'Luganda_Ganda-Latin1',
 
 62           'Lunda_Chokwe-lunda-Latin1', 'Luvale-Latin1', 'Luxembourgish_Letzebuergeusch-Latin1', 'Macedonian-UTF8',
 
 63           'Madurese-Latin1', 'Magahi-Agra', 'Magahi-UTF8', 'Makonde-Latin1', 'Malagasy-Latin1',
 
 64           'Malay_BahasaMelayu-Latin1', 'Maltese-UTF8', 'Mam-Latin1', 'Maninka-UTF8', 'Maori-Latin1',
 
 65           'Mapudungun_Mapuzgun-Latin1', 'Mapudungun_Mapuzgun-UTF8', 'Marathi-UTF8', 'Marshallese-Latin1',
 
 66           'Matses-Latin1', 'Mayan_Yucateco-Latin1', 'Mazahua_Jnatrjo-UTF8', 'Mazateco-Latin1', 'Mende-UTF8',
 
 67           'Mikmaq_Micmac-Mikmaq-Latin1', 'Minangkabau-Latin1', 'Miskito_Miskito-Latin1', 'Mixteco-Latin1',
 
 68           'Mongolian_Khalkha-Cyrillic', 'Mongolian_Khalkha-UTF8', 'Moore_More-UTF8', 'Nahuatl-Latin1',
 
 69           'Navaho_Dine-Navajo-Navaho-font', 'Ndebele-Latin1', 'Nepali-UTF8', 'Ngangela_Nyemba-Latin1',
 
 70           'NigerianPidginEnglish-Latin1', 'Nomatsiguenga-Latin1', 'NorthernSotho_Pedi-Sepedi-Latin1',
 
 71           'Norwegian-Latin1', 'Norwegian_Norsk-Bokmal-Latin1', 'Norwegian_Norsk-Nynorsk-Latin1', 'Nyanja_Chechewa-Latin1',
 
 72           'Nyanja_Chinyanja-Latin1', 'Nzema-UTF8', 'OccitanAuvergnat-Latin1', 'OccitanLanguedocien-Latin1',
 
 73           'Oromiffa_AfaanOromo-Latin1', 'Osetin_Ossetian-UTF8', 'Oshiwambo_Ndonga-Latin1', 'Otomi_Nahnu-Latin1',
 
 74           'Paez-Latin1', 'Palauan-Latin1', 'Peuhl-UTF8', 'Picard-Latin1', 'Pipil-Latin1', 'Polish-Latin2',
 
 75           'Polish_Polski-Latin2', 'Ponapean-Latin1', 'Portuguese_Portugues-Latin1', 'Pulaar-UTF8',
 
 76           'Punjabi_Panjabi-UTF8', 'Purhepecha-UTF8', 'Qechi_Kekchi-Latin1', 'Quechua-Latin1', 'Quichua-Latin1',
 
 77           'Rarotongan_MaoriCookIslands-Latin1', 'Rhaeto-Romance_Rumantsch-Latin1', 'Romanian-Latin2',
 
 78           'Romanian_Romana-Latin2', 'Romani-Latin1', 'Romani-UTF8', 'Rukonzo_Konjo-Latin1', 'Rundi_Kirundi-Latin1',
 
 79           'Runyankore-rukiga_Nkore-kiga-Latin1', 'Russian-Cyrillic', 'Russian_Russky-Cyrillic', 'Russian_Russky-UTF8',
 
 80           'Russian-UTF8', 'Sami_Lappish-UTF8', 'Sammarinese-Latin1', 'Samoan-Latin1', 'Sango_Sangho-Latin1',
 
 81           'Sanskrit-UTF8', 'Saraiki-UTF8', 'Sardinian-Latin1', 'ScottishGaelic_GaidhligAlbanach-Latin1',
 
 82           'Seereer-UTF8', 'Serbian_Srpski-Cyrillic', 'Serbian_Srpski-Latin2', 'Serbian_Srpski-UTF8',
 
 83           'Sharanahua-Latin1', 'Shipibo-Conibo-Latin1', 'Shona-Latin1', 'Sinhala-UTF8', 'Siswati-Latin1',
 
 84           'Slovak-Latin2', 'Slovak_Slovencina-Latin2', 'Slovenian_Slovenscina-Latin2', 'SolomonsPidgin_Pijin-Latin1',
 
 85           'Somali-Latin1', 'Soninke_Soninkanxaane-UTF8', 'Sorbian-Latin2', 'SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1',
 
 86           'Spanish_Espanol-Latin1', 'Spanish-Latin1', 'Sukuma-Latin1', 'Sundanese-Latin1',
 
 87           'Sussu_Soussou-Sosso-Soso-Susu-UTF8', 'Swaheli-Latin1', 'Swahili_Kiswahili-Latin1', 'Swedish_Svenska-Latin1',
 
 88           'Tahitian-UTF8', 'Tamil-UTF8', 'Tenek_Huasteco-Latin1', 'Tetum-Latin1', 'Themne_Temne-UTF8',
 
 89           'Tigrinya_Tigrigna-VG2Main', 'Tiv-Latin1', 'Toba-UTF8', 'Tojol-abal-Latin1', 'TokPisin-Latin1',
 
 90           'Tonga-Latin1', 'Tongan_Tonga-Latin1', 'Totonaco-Latin1', 'Trukese_Chuuk-Latin1', 'Turkish_Turkce-Turkish',
 
 91           'Turkish_Turkce-UTF8', 'Tzeltal-Latin1', 'Tzotzil-Latin1', 'Uighur_Uyghur-Latin1', 'Uighur_Uyghur-UTF8',
 
 92           'Ukrainian-Cyrillic', 'Ukrainian-UTF8', 'Umbundu-Latin1', 'Urarina-Latin1', 'Uzbek-Latin1',
 
 93           'Vietnamese-ALRN-UTF8', 'Vietnamese-TCVN', 'Vietnamese-UTF8', 'Vietnamese-VIQR', 'Vietnamese-VPS',
 
 94           'Vlach-Latin1', 'Walloon_Wallon-Latin1', 'Wama-UTF8', 'Waray-Latin1', 'Wayuu-Latin1', 'Welsh_Cymraeg-Latin1',
 
 95           'WesternSotho_Tswana-Setswana-Latin1', 'Wolof-Latin1', 'Xhosa-Latin1', 'Yagua-Latin1', 'Yao-Latin1',
 
 96           'Yapese-Latin1', 'Yoruba-UTF8', 'Zapoteco-Latin1', 'Zapoteco-SanLucasQuiavini-Latin1', 'Zhuang-Latin1',
 
 97           'Zulu-Latin1'] 
 98  
 
 99  item_name = {} 
100  
 
def raw(files = 'English-Latin1'):
    """
    Generate the whitespace-separated tokens of one or more UDHR files.

    files -- a single corpus item name (a string), or an iterable of item
             names.  Each name is joined onto <basedir>/udhr/ to locate
             the file to read.  Defaults to 'English-Latin1'.

    Yields the tokens produced by tokenize.whitespace() for each file, in
    the order the files were given.  Because this is a generator, no file
    is opened until iteration starts; any error raised by open() for a
    missing or unreadable file surfaces at that point.

    NOTE: each file is read in the platform's default text mode; no
    decoding according to the encoding named in the item is performed here.
    """
    # A lone name is wrapped in a tuple so the loop below walks over file
    # names rather than over the characters of the string.  isinstance()
    # (instead of an exact type comparison) also accepts str subclasses.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "udhr", name)
        # Close the handle as soon as the contents are in memory instead of
        # leaving it to the garbage collector while the generator is still
        # alive.  try/finally is used so this stays valid on interpreters
        # that predate the `with` statement.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for token in tokenize.whitespace(text):
            yield token
109
def langs(files = items):
    """
    Read several UDHR files and return their tokens keyed by item name.

    files -- a single corpus item name (a string), or an iterable of item
             names.  Defaults to the module-level `items` list, i.e. every
             listed corpus file is read.

    Returns a dictionary mapping each item name to the full list of tokens
    that raw() yields for it.  Every requested file is read completely
    before the function returns.
    """
    # A lone name is wrapped in a tuple so the loop walks over file names,
    # not characters.  isinstance() also accepts str subclasses, which an
    # exact type comparison would treat as a sequence of names.
    if isinstance(files, str):
        files = (files,)

    # Named so that it does not shadow this function's own name.
    tokens_by_item = {}
    for name in files:
        tokens_by_item[name] = list(raw(name))
    return tokens_by_item
117
def demo():
    """
    Print a small demonstration of the UDHR corpus reader.

    For the English and Italian texts in turn, prints the item name and
    then the first 27 tokens on one line.  Afterwards loads both texts
    with langs() and prints each complete token list.
    """
    from nltk_lite.corpora import udhr
    from itertools import islice

    sample_size = 27

    # Each print() call below takes exactly one argument, which produces the
    # same output under the Python 2 print statement and the Python 3 print
    # function.  Joining the tokens with single spaces reproduces what a
    # loop of `print word,` followed by a bare `print` writes: the tokens
    # separated by one space and terminated by a newline.
    # (assumes the tokens are strings that do not end in whitespace -- which
    # is what a whitespace tokenizer is expected to yield; confirm if the
    # tokenizer changes)
    for name in ('English-Latin1', 'Italian-Latin1'):
        print(name)
        print(" ".join(islice(udhr.raw(name), sample_size)))

    print("English-Latin1, Italian-Latin1")
    data = udhr.langs(files = ('English-Latin1', 'Italian-Latin1'))

    print(data["English-Latin1"])
    print(data["Italian-Latin1"])

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    demo()