#!/usr/bin/env python3
#PYTHONIOENCODING='utf-8'
#PYTHONLEGACYWINDOWSSTDIO='utf-8'
# -*- coding: utf-8 -*-
import requests
import sys
# client_id = "u-1ca29e75-8438-4878-ab63-49d31ed5442c"
# response = requests.get('https://www.letsmt.eu/ws/service.svc/json/GetSystemList',
# headers={'Content-Type': 'application/json',
# 'client-id': client_id},
# json={'appID': 'Tilde|EU Presidency|Web',
# 'uiLanguageID': 'en',
# 'options': ''})
# try:
# response.raise_for_status()
# except requests.HTTPError as e:
# print(e.response.status_code)
# print(e.response.content)
# systems = response.json()['System']
# for system in systems:
# print("System for {}-{}: '{}'".format(system['SourceLanguage']['Code'],
# system['TargetLanguage']['Code'],
# system['Title']['Text']))
# print("ID: {}".format(system['ID']))
# print()
# client_id = "u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea"
# client_id = "u-bd13faca-b816-4085-95d5-05373d695ab7"
# client_id = "u-1ca29e75-8438-4878-ab63-49d31ed5442c" # from the official site
client_id = "u-1ca29e75-8438-4878-ab63-49d31ed5442c"
system_id= "smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf" # en-ro
text = sys.argv[1]
response = requests.post('https://www.letsmt.eu/ws/service.svc/json/TranslateEx',
headers={'Content-Type': 'application/json',
'client-id': client_id},
json={'appID': 'Tilde|EU Presidency|Web',
'systemID': system_id,
'text': text,
'options': 'alignment,markSentences'})
try:
response.raise_for_status()
except requests.HTTPError as e:
print(e.response.status_code)
print(e.response.content)
# print(response.json())
translation = response.json()
print(str(translation['translation']).encode('utf-8').decode('utf-8'))
import requests
import json
import sys
import urllib.parse
sourcetext = sys.argv[1]
# print(sourcetext)
sourcetexturl = urllib.parse.quote_plus(sourcetext)
# print(sourcetexturl)
# response = requests.options(
# 'https://letsmt.eu/ws/service.svc/json/TranslateEx',
# headers={'Host': 'letsmt.eu', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'Access-Control-Request-Method': 'POST', 'Access-Control-Request-Headers': 'client-id,content-type', 'Referer': 'https://translate2018.eu/', 'Origin': 'https://translate2018.eu', 'DNT': '1', 'Connection': 'keep-alive', 'TE': 'Trailers'},
# )
# response = requests.post(
# 'https://letsmt.eu/ws/service.svc/json/TranslateEx',
# headers={'Host': 'letsmt.eu', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'Referer': 'https://translate2018.eu/', 'Content-Type': 'application/json', 'client-id': 'u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea', 'Content-Length': '310', 'Origin': 'https://translate2018.eu', 'DNT': '1', 'Connection': 'keep-alive', 'TE': 'Trailers'},
# data='{"appID":"Tilde|EU Presidency|Web","text":"401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied.","systemID":"smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf","options":"widget=text,alignment,markSentences"}',
# )
# response = requests.post('https://letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
# 'Referer': 'https://translate2018.eu/',
'Content-Type': 'application/json; charset=utf-8',
'client-id': 'u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea',
# 'Origin': 'https://translate2018.eu',
'DNT': '1',
'Connection': 'keep-alive',
'TE': 'Trailers',
}
# data = '{"appID":"Tilde|EU Presidency|Web","text":"401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied.","smt-99b2f71a-1b3b-418e-bd6b-125f61a53feb","options":"widget=text,alignment,markSentences"}'
data = '{"appID":"Tilde|EU Presidency|Web","text":"' + str(sourcetext).encode('utf-8').decode('utf-8') + '","systemID":"smt-160de000-f719-4d5b-9daa-34859345e889","options":"widget=text,alignment,markSentences"}'
# smt-160de000-f719-4d5b-9daa-34859345e889 de-en
# smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf en-ro
# smt-99b2f71a-1b3b-418e-bd6b-125f61a53feb en-de
# smt-693519e3-465c-460f-807b-3ad4736ce6b8 ro-en
response = requests.post('https://letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
print(data)
# print(response)
print(response.text)
translation = response.json()
# print(str(translation['translation']).encode('utf-8').decode('utf-8'))
# sys.stdout.buffer.write(str(translation['translation']).encode('utf-8'))
# jsontext = json.loads(response.content)
# print(jsontext)
# curl "https://letsmt.eu/ws/service.svc/json/TranslateEx" -X OPTIONS -H "User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" -H "Accept: */*" -H "Accept-Language: en-US,en;q=0.5" --compressed -H "Access-Control-Request-Method: POST" -H "Access-Control-Request-Headers: client-id,content-type" -H "Referer: https://translate2018.eu/?lang=en" -H "Origin: https://translate2018.eu" -H "DNT: 1" -H "Connection: keep-alive"
# curl "https://letsmt.eu/ws/service.svc/json/TranslateEx" -H "User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" -H "Accept: */*" -H "Accept-Language: en-US,en;q=0.5" --compressed -H "Referer: https://translate2018.eu/?lang=en" -H "Content-Type: application/json" -H "client-id: u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea" -H "Origin: https://translate2018.eu" -H "DNT: 1" -H "Connection: keep-alive" -H "TE: Trailers" --data "{""appID"":""Tilde|EU Presidency|Web"",""text"":""401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied."",""systemID"":""smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf"",""options"":""widget=text,alignment,markSentences""}"
# https://www.letsmt.eu/ws/service.svc/json/TranslateArrayEx?appID="Tilde|EU Presidency|Web"&systemID=smt-160de000-f719-4d5b-9daa-34859345e889&textArray=[Katze]&client-id=u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea
# $publicAppid = 'wikiapp',
# $apiUrl = 'https://letsmt.eu/ws',
# $webIframeUrl = 'https://readymt.tilde.com',
# $currentKey = $publicAppid + '-u-918f738b-7413-405d-acda-577ac8825db2'; // live;
#!/usr/bin/env python3
#PYTHONIOENCODING='utf-8'
#PYTHONLEGACYWINDOWSSTDIO='utf-8'
# -*- coding: utf-8 -*-
import requests
import json
import sys
import urllib.parse
sourcetext = sys.argv[1]
# print(sourcetext)
sourcetexturl = urllib.parse.quote_plus(sourcetext)
# print(sourcetexturl)
# response = requests.options(
# 'https://letsmt.eu/ws/service.svc/json/TranslateEx',
# headers={'Host': 'letsmt.eu', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'Access-Control-Request-Method': 'POST', 'Access-Control-Request-Headers': 'client-id,content-type', 'Referer': 'https://translate2018.eu/', 'Origin': 'https://translate2018.eu', 'DNT': '1', 'Connection': 'keep-alive', 'TE': 'Trailers'},
# )
# response = requests.post(
# 'https://letsmt.eu/ws/service.svc/json/TranslateEx',
# headers={'Host': 'letsmt.eu', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'Referer': 'https://translate2018.eu/', 'Content-Type': 'application/json', 'client-id': 'u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea', 'Content-Length': '310', 'Origin': 'https://translate2018.eu', 'DNT': '1', 'Connection': 'keep-alive', 'TE': 'Trailers'},
# data='{"appID":"Tilde|EU Presidency|Web","text":"401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied.","systemID":"smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf","options":"widget=text,alignment,markSentences"}',
# )
# response = requests.post('https://letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
# 'Referer': 'https://translate2018.eu/',
'Content-Type': 'application/json; charset=utf-8',
'client-id': 'u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea',
# 'Origin': 'https://translate2018.eu',
'DNT': '1',
'Connection': 'keep-alive',
'TE': 'Trailers',
}
# data = '{"appID":"Tilde|EU Presidency|Web","text":"401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied.","smt-99b2f71a-1b3b-418e-bd6b-125f61a53feb","options":"widget=text,alignment,markSentences"}'
data = '{"appID":"Tilde|EU Presidency|Web","text":"' + str(sourcetext).encode('utf-8').decode('utf-8') + '","systemID":"smt-160de000-f719-4d5b-9daa-34859345e889","options":"widget=text,alignment,markSentences"}'
# smt-160de000-f719-4d5b-9daa-34859345e889 de-en
# smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf en-ro
# smt-99b2f71a-1b3b-418e-bd6b-125f61a53feb en-de
# smt-693519e3-465c-460f-807b-3ad4736ce6b8 ro-en
response = requests.post('https://letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
print(data)
# print(response)
print(response.text)
translation = response.json()
# print(str(translation['translation']).encode('utf-8').decode('utf-8'))
# sys.stdout.buffer.write(str(translation['translation']).encode('utf-8'))
# jsontext = json.loads(response.content)
# print(jsontext)
# curl "https://letsmt.eu/ws/service.svc/json/TranslateEx" -X OPTIONS -H "User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" -H "Accept: */*" -H "Accept-Language: en-US,en;q=0.5" --compressed -H "Access-Control-Request-Method: POST" -H "Access-Control-Request-Headers: client-id,content-type" -H "Referer: https://translate2018.eu/?lang=en" -H "Origin: https://translate2018.eu" -H "DNT: 1" -H "Connection: keep-alive"
# curl "https://letsmt.eu/ws/service.svc/json/TranslateEx" -H "User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" -H "Accept: */*" -H "Accept-Language: en-US,en;q=0.5" --compressed -H "Referer: https://translate2018.eu/?lang=en" -H "Content-Type: application/json" -H "client-id: u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea" -H "Origin: https://translate2018.eu" -H "DNT: 1" -H "Connection: keep-alive" -H "TE: Trailers" --data "{""appID"":""Tilde|EU Presidency|Web"",""text"":""401 - Unauthorized: Access is denied due to invalid credentials. You do not have permission to view this directory or page using the credentials that you supplied."",""systemID"":""smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf"",""options"":""widget=text,alignment,markSentences""}"
# https://www.letsmt.eu/ws/service.svc/json/TranslateArrayEx?appID="Tilde|EU Presidency|Web"&systemID=smt-160de000-f719-4d5b-9daa-34859345e889&textArray=[Katze]&client-id=u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea
# $publicAppid = 'wikiapp',
# $apiUrl = 'https://letsmt.eu/ws',
# $webIframeUrl = 'https://readymt.tilde.com',
# $currentKey = $publicAppid + '-u-918f738b-7413-405d-acda-577ac8825db2'; // live;
'''
import requests
headers = {
'authority': 'www.letsmt.eu',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'data-type': 'json',
'client-id': 'u-5d4e301e-cddc-4f21-a350-0c3e5d2bee37',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Mobile Safari/537.36',
'content-type': 'application/json',
'accept': '*/*',
'origin': 'https://www.presidencymt.eu',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.presidencymt.eu/',
'accept-language': 'en-US,en;q=0.9,de;q=0.8,ro;q=0.7',
}
data = '{"appID":"Tilde|EU Presidency|Web","options":"widget=text,alignment,markSentences","systemID":"smt-e-transl-de-ro","text":"Das ist gut"}'
response = requests.post('https://www.letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
print(response.text)
'''
'''
import requests
headers = {
'authority': 'www.letsmt.eu',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'data-type': 'json',
'client-id': 'u-5d4e301e-cddc-4f21-a350-0c3e5d2bee37',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Mobile Safari/537.36',
'content-type': 'application/json',
'accept': '*/*',
'origin': 'https://www.presidencymt.eu',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.presidencymt.eu/',
'accept-language': 'en-US,en;q=0.9,de;q=0.8,ro;q=0.7',
}
data = '{"appID":"Tilde|EU Presidency|Web","options":"widget=text,alignment,markSentences","systemID":"smt-e-transl-de-ro","text":"Das ist gut"}'
response = requests.post('https://www.letsmt.eu/ws/service.svc/json/TranslateEx', headers=headers, data=data)
print(response.text)
python3 get-system-list.py u-1ca29e75-8438-4878-ab63-49d31ed5442c
python translate-text.py u-1ca29e75-8438-4878-ab63-49d31ed5442c smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf "<div>This is an <b>example</b> of a translation request <img src=\"http://letsmt.eu/images/tilde.svg\" /> with formatting tags.</div>"
Client ID
u-1ca29e75-8438-4878-ab63-49d31ed5442c
u-5d4e301e-cddc-4f21-a350-0c3e5d2bee37
Austria 2018 Tilde
u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0ea
Estonia 2017
u-dc4cd3c5-ebc9-4213-ac9d-593c896bc0e
System for fr-lt: 'French - Lithuanian (NMT) LT-MT3'
ID: smt-a1d5726f-4356-4e0c-bc04-42dd8d117722
System for fi-sv: 'Finnish - Swedish (NMT)'
ID: smt-c522efc0-2493-4063-9feb-7cd1610276e9
System for lt-ru: 'Seimas - Lt-Ru - NMT'
ID: smt-8d6f52a3-7f5a-4cca-a664-da222afe18b5
System for nl-fr: 'FREME Dutch-French Legal - v1.1'
ID: smt-63f939f0-7ebf-4b45-978a-4fa4714c601b
System for en-sv: 'NLG English-Swedish SMT System'
ID: smt-51ae56f7-cc70-4486-b854-6a85ee57b9a6
System for en-fr: 'EN-FR (Canadian) TRSB Domain System - NMT'
ID: smt-0fd9e73e-2e1d-4be4-b09f-8f304521285d
System for fr-en: '_TMP to get fr-en probability dictionary'
ID: smt-68e08cf1-7b27-4afb-bab4-941f74392eab
System for pl-lt: 'PL-LT -- general -- v3 (incl. 1M MS UI Strings 2012)'
ID: smt-98017a8a-3432-4533-80a5-df035aa8822f
System for en-lv: 'Medicine EN-LV'
ID: smt-f3689d1e-502c-4065-b053-5988d2fb9213
System for en-lv: 'English - Latvian IT (for ORACLE experiment)'
ID: smt-b33794e2-a109-43fd-8075-4d51aa12b771
System for en-pl: '[INTERTEXT] English-Polish Legal & Finance NMT'
ID: smt-ce57328e-070c-4c58-8d26-37f7493fd2e8
System for lv-en: 'MNKC mono data fetch'
ID: smt-9bf820e1-342b-45b4-b1c0-a6b62ce39313
System for en-de: 'English - German Legal V2.0'
ID: smt-c71f6f22-5e2a-4682-a4cc-2d97d38fac5e
System for lv-ru: 'ERAF-MT LV-RU (valsts pārvalde) v0.4'
ID: smt-2120323c-dc13-4b99-92fc-def2aa620228
System for en-lv: 'test en-lv fetch'
ID: smt-cb95eb1e-5a62-41ec-9369-2c5162835634
System for et-en: '[EE EU Presidency 2017] ET-EN - CyberSecurity v2'
ID: smt-3c71368a-c334-44c4-a411-34bd6c3769ee
System for en-lv: 'EESC corpus test. DCEP + EESC'
ID: smt-528fc619-ea0f-4a4b-bd55-8231d3f244ad
System for es-en: 'Spanish - English (NMT) Lynx'
ID: smt-4eafabb9-7cd6-4ae6-9dd6-6b7cc68925bb
System for en-lv: 'English - Latvian Automotive (VOLVO+Jaguar+others)'
ID: smt-6e22d957-a2d7-4c27-97cd-f370eb99b092
System for en-pl: 'GET IT - En-Pl - NMT'
ID: smt-bf087dc6-9230-44e3-aafa-4e541e92732c
System for ru-lt: 'Ru-Lt (General) NMT'
ID: smt-762e0ddf-e034-4922-85eb-edfa2f1cd8e2
System for lt-fr: 'Lithuanian - French (NMT) Legal LT-MT3'
ID: smt-d8313a9e-34d1-440a-a485-196b6ebe9527
System for en-es: '[Printful] English-Spanish NMT System'
ID: smt-09772173-4c66-4718-8bb9-e711cc9b71b5
System for en-lt: 'English - Lithuanian IT (v6.0.2) (Oracle)'
ID: smt-28477162-8e79-487f-80f3-ef8408fe5252
System for sv-fi: 'Swedish - Finnish (NMT)'
ID: smt-d14cff5a-5463-47f1-80c9-fe22e1bddd95
System for en-lv: 'Tilde Localization. English-Latvian IT '
ID: smt-76c4626c-3234-4ee0-b424-bef0a63dedc7
System for fi-en: 'Finnish - English (NMT) V1 Stockmann'
ID: smt-396b4810-0a06-4644-8e10-f0a8e1d764c0
System for en-et: 'English - Estonian IT with Dynamic Learning Disabled'
ID: smt-73931d5d-bda5-4bea-ba61-4ec3b79794e1
System for en-et: 'English-Estonian Sockeye NMT System (WMT 2018)'
ID: smt-6fcd5014-55fb-4fcb-a508-0de6408ab67c
System for en-lt: 'TB English - Lithuanian (General) v4 - Copy for LT Mono Corpora'
ID: smt-2768d4d6-01fa-4aa8-9fde-db86724288d9
System for de-lv: 'German - Latvian (NMT)'
ID: smt-bbcc3598-1999-4805-8df4-64edd0d80fdb
System for en-lt: 'Oracle'
ID: smt-7301b611-ffc5-4665-b8a8-dbabd7a69ac2
System for pl-en: 'GET IT Pl->En dummy system'
ID: smt-889b2453-0d7b-491e-a020-14baf606cae3
System for en-lv: 'Tilde Corpora Test. Microsoft 2014'
ID: smt-fd862419-05d7-4230-97ee-ff7511c19c0a
System for en-es: 'LYNX EN-ES dummy'
ID: smt-faf4ea3f-6c55-4ec8-8e3b-b1b2e5ca771c
System for lv-ru: 'ERAF-MT LV-RU (vispārējā) v0.4'
ID: smt-04969987-ccf0-4b71-9950-dd397439b39a
System for lv-et: 'Latvian-Estonian baseline'
ID: smt-ddb22d10-ba59-400e-8703-6b3e5908dfbe
System for et-en: 'Estonian - English (General) v2.1 - Copy 2'
ID: smt-1f5cef06-9981-4a41-ad10-f96e46f993b7
System for ru-lv: 'ERAF-MT2 • Mono TIESLIETU jomas korpusi'
ID: smt-ed98ec5a-86e9-45df-a50b-ae8f5f80853e
System for en-lv: 'English - Latvian IT (EN-LV v5.1)'
ID: smt-10562c8e-89ae-45ff-875f-7b5cf57be86b
System for sl-en: 'Slovenian - English Finance GORR (NMT)'
ID: smt-91719894-1ca3-4867-b6c4-4f6aad4d3398
System for en-pl: 'GET IT data fetch-clean'
ID: smt-c5a8d8e1-6556-4b42-b291-77366608c04e
System for en-ru: 'Hugo.lv UI translator (EN-RU)'
ID: smt-6d4fe537-71cc-4434-be44-9afd48567d6d
System for en-pl: 'English - Polish Patents (NMT) v2'
ID: smt-9d2ba06f-4f3c-48bb-aaef-5d93d8fe9624
System for lt-en: 'Lithuanian - English (NMT) Legal LT-MT3'
ID: smt-314ac4fb-a349-45b7-8784-2b6e23601514
System for lt-de: 'LT-MT3 LT-De Data Fetch'
ID: smt-0c8e1ff1-732d-4dab-88ea-95a2e66de98a
System for en-fr: 'EN-FR (Canadian) TRSB Broader Domain System'
ID: smt-bc8e6f7c-c973-4ce1-b223-c5c338b152d1
System for en-lv: 'TB2015 EN-LV v02.2 - more data - data loc'
ID: smt-3dab00d6-709d-479c-bf00-8d4486b4fb35
System for en-lt: 'English - Lithuanian v5.0 (automotive)'
ID: smt-712f6361-7ae2-4508-99a1-8255ac46f66f
System for en-et: 'English - Estonian IT (v5.1)'
ID: smt-ebb12e2e-7b6e-49c8-91f8-8d1bf1878019
System for en-lv: 'English - Latvian IT (EN-LV v5.8) without eval&dev filter'
ID: smt-fefb0423-b090-4b9f-a1e8-72f4de32c9eb
System for en-de: '[allintranslations.com] EN-DE data fetch'
ID: smt-8b98760f-d026-4ffc-b5bd-e53c461dff50
System for nl-de: 'FREME Dutch-German Legal - v1.1'
ID: smt-bb73bc07-7516-4d61-b6ee-cccfe1f8eab9
System for lt-de: 'Lithuanian - German (NMT) v0.2'
ID: smt-4a15e40b-042d-4cbb-bab6-36a8f70f4575
System for lt-en: 'Lithuanian - English IT (v1.1)'
ID: smt-7f5cbf7a-4b5e-4822-b37a-e2b6d62ab9e3
System for de-et: 'Danpower - DE-ET - for Data Fetch'
ID: smt-ee0b7844-28c9-46d6-b623-863a978353de
System for lv-ru: 'LV-RU system for corpora fetch to build data for Probability dictionaries'
ID: smt-9b2059d6-8970-44d7-a302-7f7c5d3a161a
System for en-lv: 'EESC corpus test baseline. JRC Acquis + DCEP + DGT + Europarl only'
ID: smt-2ce7a40b-7429-449b-99bb-5d6ce3d53a28
System for en-sv: 'NLG English-Swedish Adapted NMT System'
ID: smt-5debdeb0-8bd9-41b7-a3d1-b1a64b2f17e1
System for en-lv: 'TB English - Latvian (v03)'
ID: smt-e3080087-866f-498b-977d-63ea391ba61e
System for en-lv: 'English - Latvian IT (EN-LV v5.8)'
ID: smt-4a3835e0-cd63-46bc-8a44-65493e1b69a1
System for en-lv: 'English - Latvian IT (v4)'
ID: smt-89ce23df-462c-4910-a5fe-5b5de144caad
System for en-lt: 'EN-LT (General) NMT'
ID: smt-479476a6-64a0-4eba-ba1a-2918e2296d9e
System for en-lv: 'Tilde Localization. English-Latvian IT - Part 2 (Microsoft corpora only)'
ID: smt-4123bf84-e5e2-477c-a0ad-2e62edc88f4a
System for en-lv: 'IT'
ID: smt-d39ab8f5-43ff-44f7-a03a-3b0e6a843bc4
System for lt-pl: 'LT-PL -- general -- v3 (incl 1M MS UI Strings 2012)'
ID: smt-7435d639-cf9f-44b3-9796-90b8d842818a
System for de-en: 'DE-EN Legal System Baseline - With Dynamic Learning'
ID: smt-fe2c95a4-c387-47c5-bf4f-fb446d48f4aa
System for de-en: 'mono-translated.en-de - corpus clean'
ID: smt-27303ac3-83fc-4161-8471-97546c001961
System for en-lv: 'EESC corpus test. DGT + EESC'
ID: smt-22ef75c1-b4bc-4127-916c-dd44d099e09d
System for lt-en: 'EN IT Mono corpora fetch (LT-MT3, to build EN IT Language Model) - Copy'
ID: smt-8d36322a-12db-4341-9c72-01fb1d7dad35
System for et-en: '[EE EU Presidency 2017] ET-EN - CyberSecurity'
ID: smt-9a579b70-b5e2-4a22-a0dc-43eef3fbd78b
System for nl-en: '[NL-EN] Test'
ID: smt-36a40806-cbb7-4f87-a8b8-712307e1be74
System for en-lv: 'MNKC EN-LV SMT'
ID: smt-32e567f7-6f39-4297-a00c-0434dea12807
System for lv-en: 'ERAF-MT2 • Mono TIESLIETU jomas korpusi'
ID: smt-fc1e62d2-257b-4a4a-ae23-865643144d11
System for en-et: 'IT'
ID: smt-73de35cc-6bee-494a-a681-15c87299956c
System for en-da: 'English - Danish -- data fetch -- for dsb.dk'
ID: smt-59b723c1-3283-4da6-a245-bca5c14096b2
System for de-en: 'DE-EN Adapted NMT System for Kothes'
ID: smt-acccba78-db2b-44d5-9478-c42ffb888e4e
System for lt-en: 'IADAATPA LT-EN'
ID: smt-24a5eff7-5784-4a60-900c-2f90b093e424
System for de-sk: 'German - Slovak (NMT) Aspena'
ID: smt-968eb397-281c-415f-a2c3-cb3d6f6e1469
System for en-lv: 'TB2016 EN-LV v0.2 (more data)'
ID: smt-7c5e5db9-650b-48d6-b148-fefb2fddb599
System for ru-et: 'General domain system'
ID: smt-778032a7-4783-4d77-9cfb-b17a2bebb3af
System for pl-en: 'Polish - English Clinical trials (NMT) Kontekst'
ID: smt-1f04c12a-50de-470b-80ee-0a21fa2ab244
System for en-da: 'English - Danish (NMT) AGA'
ID: smt-a3152f7c-8256-4dea-91a5-b93ecf8e491e
System for en-lv: 'Angļu - Latviešu (Mežu nozares)'
ID: smt-54e11922-eee7-4f43-9120-bc965852885b
System for en-et: 'IT system'
ID: smt-27527309-10d6-4c7d-885d-3843dd4b5081
System for en-lt: 'English - Lithuanian IT (v4)'
ID: smt-8ce0396a-b4fb-4a2f-9e0d-ca8dd129e77e
System for da-en: 'Yet another DA-EN baseline'
ID: smt-1c30eee6-1d60-481f-8d3e-6aa3d25d1d33
System for lt-pl: 'Lithuanian - Polish (NMT) LT-MT3'
ID: smt-fc794ab3-c8af-4e29-b460-4d93adb74ce5
System for de-en: 'Austrian presidency - De-En - non-tuned NMT'
ID: smt-cf4845a7-77dc-4e9b-ab19-8452c282e55b
System for de-en: 'DE-EN Legal System Baseline'
ID: smt-8a8d0a06-c679-409e-9c24-6ab02223e4eb
System for en-lt: 'English - Lithuanian IT (Philips)'
ID: smt-cde5d5be-e6b6-42d0-999b-c2a25450e3b9
System for en-lt: 'English - Lithuanian v5.1 (automotive)'
ID: smt-89522e28-8a63-40a7-bbb0-541b46839f4c
System for de-lv: 'German-English Test'
ID: smt-8b9f24c5-7fae-4e24-be5c-0be23b2385ee
System for en-lt: 'Tilde Localization. English-Lithuanian IT '
ID: smt-2c6984d7-a2b6-42b1-a977-9c44afc33650
System for en-lt: 'English - Lithuanian (General) v5'
ID: smt-b69e08e1-33cd-48f5-96ec-8f589b40ffd6
System for lt-ru: 'Lithuanian - Russian (NMT) LT-MT3'
ID: smt-0d55ef71-81ef-40fc-8192-c43776ff1145
System for lt-de: 'Lithuanian - German (NMT) LT-MT3'
ID: smt-2803fa35-6c0d-45a7-ac18-fb9618d7cac3
System for ro-en: 'Romanian - English (NMT) Presidency'
ID: smt-693519e3-465c-460f-807b-3ad4736ce6b8
System for en-lt: 'LT Seimas korpusi pietrenēšanai - data fetch'
ID: smt-e1de6ffe-63b2-465a-8c17-6a3c9ec10223
System for en-lv: 'English - Latvian IT (EN-LV v5.8) without eval&dev filter [Dynamic]'
ID: smt-99d3a40a-6804-414d-bfdf-39996465a812
System for et-ru: 'Estonian - Russian (General) v0.1'
ID: smt-d8131189-a7e7-47bb-b31e-f2917251f774
System for en-et: 'EU Presidency NMT system (EN-ET)'
ID: smt-f313a5e6-f532-47f4-aa8c-5a963e933a0b
System for en-ja: 'English - Japanese V3 (NMT) TLS Translations'
ID: smt-009d007d-a2ca-4932-968a-8164e0c8380b
System for en-es: '[Chess] English-Spanish NMT System'
ID: smt-1fe03594-30b3-4316-b472-e60fb4298834
System for en-pl: 'FACTSET EN-PL'
ID: smt-c8e3aa16-3c2a-4d5c-a805-917dc79dd021
System for en-lv: 'ERAF-MT2 • Mono LV KULTŪRAS jomas korpusi'
ID: smt-2ba3abd8-9fb8-4e55-a5e0-be25fc0faa21
System for en-sl: 'Amidas En-Sl - Only In-domain data'
ID: smt-29b1365a-db28-43d0-9065-94c89a221a71
System for lt-pl: '[LT-MT3] Systems to build data for Probability dictionaries'
ID: smt-87f3d811-8ff3-4f1a-ac27-91e0b651bcc1
System for en-lv: 'ODINE - EMA test. EMA only'
ID: smt-124914f5-bb6f-4ba9-a098-ebac410f00e7
System for lv-en: 'TB2016 LV-EN v01'
ID: smt-08ff5e72-e016-4763-9dfc-7ad5362945cd
System for fr-lv: 'FR-LV Lettonie - Francija'
ID: smt-6020be88-49b0-4260-ad9d-4b50ef4e564e
System for en-et: 'Celsius EN-ET-NMT'
ID: smt-fff0096b-9a37-4b67-baa2-2f8317ab392e
System for ru-et: 'Russian - Estonian (General) v0.1'
ID: smt-1a98cbd0-8e28-4388-a916-05d5883410a4
System for en-lv: 'English - Latvian IT (EN-LV v5.8) with eval&dev filter'
ID: smt-1a589c0d-b56f-4e29-a8c8-f0bc695d0620
System for lv-en: 'Rail Baltica - Lv-En - NMT'
ID: smt-2ac18dd7-044b-46f3-adf1-97eefa521c9c
System for en-fr: 'Open Data demo system (EN-FR)'
ID: smt-cfd2a84b-ddb3-4431-a5bb-dfa578544974
System for en-sl: 'Amidas - En-Sl - NMT'
ID: smt-0da5d07f-c557-42fb-8f26-b0473b38fe2f
System for en-lt: 'English - Lithuanian (NMT) Legal LT-MT3'
ID: smt-c7d62941-60d6-41a6-aa2f-16d2d8d815d4
System for en-lv: 'ERAF-MT EN-LV (vispārējas jomas) v0.4'
ID: smt-1c08a5bb-95e8-4806-9a7f-3a9ad2114eca
System for en-pl: 'Locworld En-Pl - NMT'
ID: smt-9979a8e9-2428-49ed-b627-c95337533ab2
System for de-en: 'Austrian presidency back-translated De to mono En corpus-clean'
ID: smt-0a61bedc-89f1-4c8b-8463-bab6dc73d6f1
System for en-lv: 'EESC corpus test. Europarl + EESC'
ID: smt-f5fe7a74-4d1b-4b8f-86a1-e536cf63938d
System for en-lv: 'TB2016 EN-LV v0.3'
ID: smt-01214522-5f46-440f-984d-c8ad3bb01baa
System for et-en: '[EE EU Presidency 2017] ET-EN - General Domain v2'
ID: smt-7cce8647-8aa0-40d8-b1b6-77295c0b23bb
System for en-lt: 'WMT19 EN-LT SMT Baseline '
ID: smt-a08583ae-454c-4d76-80f9-f86b0c5351e2
System for lv-en: 'TB Latvian - English NMT'
ID: smt-46b9633d-48d9-4dd7-b096-2516c383a715
System for lv-en: 'LV-EN data from likumi.lv '
ID: smt-5d9f1d2c-f56b-43da-95ab-db26e4f876ab
System for lv-en: 'Latviešu - Angļu (Vispārēja)'
ID: smt-06bd7f86-792f-4d47-a260-b3857439fc1e
System for en-lv: 'EN-LV-Marian-NMT - Updated Files'
ID: smt-16d2a887-317f-4ef4-976b-90bd8c5e1a46
System for en-lv: 'English - Latvian IT (EN-LV v5.8) with eval&dev filter [Dynamic]'
ID: smt-d3c6b4f8-b850-4970-83a7-cad8415a6d31
System for en-lt: '[LT-MT3] Systems to build data for Probability dictionaries'
ID: smt-85320d0e-071c-4845-8859-e99563363837
System for en-ja: 'English-Japanese New data fetch'
ID: smt-f278417e-3f4f-4c3d-9d0b-61f600c911f5
System for lv-ru: 'LV-RU Test'
ID: smt-63e7d12b-fce0-4091-baa2-5aa524cc9e00
System for en-lv: 'EN-LV for Corpus Fetch - Copy'
ID: smt-00682b26-7fa3-483f-a083-6880bf4cf419
System for en-fr: 'EN-FR (Canadian) TRSB Domain System'
ID: smt-db5e8221-b283-4212-8879-06124ce0cc3b
System for en-et: 'TB English - Estonian NMT'
ID: smt-9b3e3178-f0b3-41db-960d-ed2ce09904e4
System for et-fi: 'Estonian - Finnish (NMT) v2'
ID: smt-c45139d8-f1b1-46c2-9b21-c84c51285b03
System for en-et: 'WMT 2018 EN-ET Corpus fetch'
ID: smt-eb980931-8ebd-49b6-abc5-a9a765b3e1c3
System for fi-en: 'Finnish - English (NMT) Presidency'
ID: smt-48b29c38-c6c9-49c3-ab2e-2bfc1c0d1b73
System for pl-en: 'FACTSET PL-EN'
ID: smt-f311f0ac-ee44-49e0-9116-252db1e4a0c7
System for de-sk: 'ASPENA - KAUFLAND - Food [data fetch] - Copy'
ID: smt-ca7533a9-e20d-4cd8-99f8-3db94ff03e0e
System for en-lt: 'Tilde Localization. English-Lithuanian IT - Part 2 (Microsoft corpora only)'
ID: smt-db2da569-c3e6-404b-9dfb-f895d8d0d991
System for en-ro: 'ro-en-test - Copy'
ID: smt-e63d1422-dc59-450f-93ac-0fde3520ac3a
System for en-lt: 'English - Lithuanian IT (v6.0) (Oracle) – Decode XML entities' OFF'
ID: smt-8fcd5f95-1c4d-4796-98b0-d6ebdec3a47e
System for lt-ru: 'TB Lithuanian - Russian (General) v4'
ID: smt-84c2eb73-da09-48ce-8cb9-71b61f37483c
System for de-sk: 'ASPENA - KAUFLAND - Food [data fetch]'
ID: smt-7ae1e77c-e879-4ecd-94ea-1c2338e1700d
System for en-de: 'Austrian presidency back-translated En to mono De corpus-clean'
ID: smt-1967e392-5c07-44c2-9abe-cbd7f1fcdea4
System for en-et: 'IT system'
ID: smt-f401997e-a928-4f47-b926-2cb0e31491f5
System for en-lv: 'Angļu - Latviešu (Vispārēja) - Sockeye Transformer'
ID: smt-f19dd79f-9399-4dd0-bf36-f5de9c8b21c4
System for en-mt: '[EN-MT] Data Fetch & Phrase Tables Build'
ID: smt-fcc2dc3c-4330-457a-a3c2-69a6e2e7d0b0
System for lt-de: 'Dummy for Lt-De'
ID: smt-edd70df9-88aa-451f-ab54-e61116f0b43f
System for lt-fr: 'Lithuanian - French (NMT) LT-MT3'
ID: smt-9696b5ba-e3a4-420c-b740-916051c5c4e7
System for en-et: 'English - Estonian IT with Dynamic Learning Disabled - Moses 13.10'
ID: smt-94d82eeb-53af-4b6d-8699-f0fc88465c22
System for lv-en: 'Latvian - English (NMT) Linearis 2'
ID: smt-06dfb135-62b2-4380-bae2-1c6f2663e32b
System for en-fr: 'EN-FR skelets TRSB'
ID: smt-2bb8ca98-b9a5-4086-8635-e7d113f087be
System for sl-en: 'GORR Finance SL-EN data fetch'
ID: smt-731ea48e-869c-4b29-b785-25a5da260e7c
System for ru-lt: 'TB Russian - Lithuanian (General) v4'
ID: smt-b05093cb-7395-4fb9-bd15-5fff77d78413
System for en-lv: 'ERAF-MT2 • Mono LV GENERAL (ziņu) jomas korpusi'
ID: smt-556825d1-0ac9-4589-9305-dad3f592da74
System for en-et: 'English - Estonian IT (v5.5)'
ID: smt-789b5215-4e11-49ab-a483-827c18b7f85c
System for en-de: 'En-De for Corpus Fetch'
ID: smt-7e965f8e-3bfc-4c1b-bfee-5858ce0afc01
System for lt-en: 'TB Lithuanian - English (General) v4'
ID: smt-b459424c-0135-4835-b483-39d6bdc49bf8
System for en-lt: 'LT Corpora Inventory'
ID: smt-a78a814c-38db-46e8-a2b9-e155f8a33ed5
System for en-zh: 'AP English-Chinese V6 (Traditional HK)'
ID: smt-55b87498-4932-406a-a4c2-ee0edf61d78b
System for lv-ru: 'Lv-Ru (General) NMT'
ID: smt-7bbc8ee2-1f91-409a-8fea-b470d4f3af4d
System for lt-en: 'WMT19 LT-EN SMT Baseline'
ID: smt-b888ccf9-b5c5-4188-900a-cb9fc57fd89b
System for sv-fi: 'Swedish - Finnish data fetch'
ID: smt-42b154c9-1989-433c-8ee8-a5347796ba04
System for de-nb: 'German-Norwegian MT -- do we have data for this?'
ID: smt-041e1cb5-5409-4ca1-a2a3-d544bd8ff353
System for es-en: 'Spanish - English (Doppler Labs) - v1.2'
ID: smt-dd630d3f-7aff-4610-b8b6-f01cd3b2c49c
System for en-et: 'TB English - Estonian (General) v2.2'
ID: smt-1a0ec013-48c1-4f39-a0b7-382cb27ca387
System for en-et: 'English - Estonian IT (v5.4)'
ID: smt-9d3fa2b2-dd0d-493f-b9f5-f4873917ea83
System for en-et: 'English - Estonian IT (v5.5.1)'
ID: smt-e38939b0-dedd-4091-865d-da204cbe5897
System for en-lv: 'English - Latvian VOLVO'
ID: smt-573a52cb-8728-455a-b05c-ac73865b053f
System for en-lv: 'English - Latvian IT (EN-LV v5.8) without eval&dev'
ID: smt-4bfa5c33-640a-4148-a707-3b3f373e1a87
System for ja-en: 'Japanese - English V3 (NMT) TLS Translations'
ID: smt-e3c86941-564d-47e0-8a33-5fb4504def52
System for sv-en: 'Swedish - English NMT CircleK new'
ID: smt-abf49352-0264-4480-bcfb-a7c7fabd2b0f
System for pl-en: 'Diuna General [data fetch]'
ID: smt-a5b64a69-fc93-464b-b39f-c981492eaf45
System for en-fi: 'Finnish Presidency Data Fetch'
ID: smt-8984b580-6ad5-43b8-a26d-03ff9c34a84c
System for en-pl: 'EVAL EN-PL - General Eval'
ID: smt-aebde704-ef7d-4b92-b402-33dd902dba5d
System for pl-lt: 'PL-LT - dummy'
ID: smt-8960939a-3bee-43c9-94cc-a26960acdd69
System for en-ja: 'English-Japanese SMT'
ID: smt-df1afba0-5e90-41d6-a6f0-f654b2be261d
System for en-lv: 'English - Latvian (NMT) Linearis 2'
ID: smt-a6f1cbc5-1d9f-4c4d-8d4d-6b0699446112
System for en-is: 'en-is dictionary'
ID: smt-5384173b-5fc5-45a1-a105-f0b5a322797c
System for en-lv: 'EN-LV SMT Demo System'
ID: smt-8b6f6a0e-552b-4dce-9724-ff83ac221b0f
System for en-ja: 'English-Japanese full new data fetch'
ID: smt-1ace62d7-8b65-4a6e-9688-8fd4992f856f
System for es-en: 'LYNX ES-EN Legal Data Fetch'
ID: smt-552534dd-9f7a-4cc1-b786-860abd435450
System for lt-en: 'LT-EN SMT Baseline System for LT-MT3'
ID: smt-88b7ea88-f540-4318-ad17-1a27ece043bd
System for en-lv: 'ODINE - EMA test. Tilde EMEA 2014 only'
ID: smt-63ebdc9c-c87d-458f-addb-84372d571b14
System for lv-en: 'ERAF-MT LV-EN (vispārējas jomas) v0.4'
ID: smt-5abbb6ca-f956-44df-823f-9c32848bc806
System for en-ar: 'AP English-Arabic v2.1'
ID: smt-f630c1c4-3267-49d1-983c-f42397620aaf
System for lv-en: '[LMI] LV-EN data fetch for tuning'
ID: smt-db0d1550-f559-448d-9d64-189dc5acae93
System for ru-lv: 'Ru-Lv (General) NMT'
ID: smt-95501b3b-1b31-4d90-b115-c3543f9149cc
System for en-pl: 'EN-PL General SMT'
ID: smt-cc09723e-1fb6-421e-8bb6-581c57f041bb
System for en-pl: '[BIRETA] EN-PL NMT System'
ID: smt-71fdb154-11d9-49e8-b461-a22efbc2fe67
System for et-en: 'eesti - inglise EU Presidency NMT system'
ID: smt-85a613e5-5b6f-473a-84a4-d3fdfb0d187e
System for en-et: '[EE EU Presidency 2017] EN-ET - CyberSecurity v2'
ID: smt-ff2d7d25-dc65-4eaa-8129-0f67a7d5f547
System for et-ru: 'bEstMT ET-RU - November 24, 2017 - NMT MLSTM'
ID: smt-4a9a21c4-0d0d-49f1-a0b1-e4a4b7f6ab4b
System for de-lt: 'Danpower - DE-LT - for Data Fetch'
ID: smt-1798a2b3-b4f1-495d-86b5-2ef0fbdb53af
System for lt-en: 'Lithuanian - English IT (v1)'
ID: smt-c87a0f5c-6761-4f52-836f-ae8cc942eab2
System for de-en: 'Austrian presidency mono data repaired newlines - English'
ID: smt-2c74acbf-c936-4fed-80fd-86c07e1c381d
System for en-lt: 'English - Lithuanian (General) v5.1'
ID: smt-e47b1bce-e32c-41f9-81e6-f4dd9ef04dcc
System for en-pl: 'English - Polish INTERTEXT Medical (NMT)'
ID: smt-856eccfb-9b6d-4f2a-a5fc-9ef6ca4dc897
System for lt-en: 'LT-MT3 LT-EN IT Comparable SMT System'
ID: smt-39cf557b-b8ff-4925-a7b8-bd7d6083a591
System for en-pt: '_SeproTec: English-Portuguese'
ID: smt-794c6133-ce99-4fb6-be49-1d74ba809560
System for nl-en: 'LYNX NL-EN Energy Data Fetch'
ID: smt-1f78b65a-8d9d-49d8-8cff-3f13b4e75540
System for lt-en: 'Lithuanian - English (NMT) Seimas v2'
ID: smt-c9fb0b01-4414-4404-9698-80c5d424f0c6
System for lv-ru: 'TB LV-RU v2.0'
ID: smt-1094dff0-c98e-47f3-8a04-17056da39850
System for fr-lt: 'FR-LT SMT Baseline System for LT-MT3'
ID: smt-667c7342-6eaf-42d5-8320-41520bd9dce9
System for en-lv: 'EESC corpus test baseline. EESC only'
ID: smt-6c9098ac-dd42-41f1-9fe3-a6df9a761528
System for lt-en: 'Lithuanian - English (NMT) WMT 2019'
ID: smt-29ca7818-628c-4bc6-a67a-18b9dd3d4cf9
System for lv-en: 'LV-EN Legal Baseline (NMT)'
ID: smt-6a218329-b9e8-42e8-a9da-f6d30da81619
System for en-fi: 'English - Finnish (NMT) Presidency'
ID: smt-e081b525-3a5e-4e4c-9f93-de46a2c04fa4
System for en-lt: 'EN-LT (General) NMT - for testing only'
ID: smt-2b29ab02-1782-4913-af99-563eb1ff73dc
System for de-lv: 'German - Latvian (NMT) v2'
ID: smt-f53f243f-bbe5-4bd2-b217-79d08595d40d
System for pl-lt: 'Polish - Lithuanian (NMT) LT-MT3'
ID: smt-3a22d36d-26d5-40b5-bec8-f24bfdd6e4d1
System for en-lv: 'EESC corpus test baseline. JRC Acquis only'
ID: smt-bdc8d1f4-4fba-4d28-97bb-ea9a3ca003f5
System for en-lv: 'TB2015 EN-LV v02.2 - more data'
ID: smt-1faaa188-3781-4e50-a678-cfec86ba5b5c
System for fi-sv: 'FI-SV dummy '
ID: smt-cb64a2f8-f605-439c-a023-86ddf9799845
System for de-lv: 'DE-LV data fetch '
ID: smt-30864045-8536-4b3b-8428-64181b569e3e
System for en-da: 'EN-DA AGA MT Pilot - Dummy '
ID: smt-56c6b9ed-eedd-40a4-bdb4-feb428a180ad
System for lt-en: 'LT-EN (General) NMT - for testing only'
ID: smt-0c5e395d-aea0-403d-bea4-373bed908804
System for en-lv: 'English - Latvian IT (EN-LV v5.7)'
ID: smt-050450e3-e8bd-4c6a-a15f-000c63a0d1b1
System for fr-da: 'TextMinded fr-da - v1'
ID: smt-9ab3ce95-b37e-441a-b5f5-57d902b88e6a
System for lv-ru: 'ERAF-MT2 • Mono RU KULTŪRAS jomas korpusi'
ID: smt-43089f56-cbfa-484c-a43d-e5563800bc85
System for en-de: 'Austrian presidency mono data repaired newlines - German'
ID: smt-913ab605-1d0a-491f-90c6-c244fc1ca72a
System for de-en: 'Austrian presidency mono data Only good newlines - English'
ID: smt-f82c73ac-e587-41a4-a542-67e7754b1aa1
System for da-en: 'DA-EN AGA MT Pilot - Dummy'
ID: smt-5f0140f3-ecc9-4956-a01e-5c9a0d7066ce
System for fr-lt: '[LT-MT3] Systems to build data for Probability dictionaries - Factorize'
ID: smt-d04e50a5-8565-4bd5-b365-adab890f5587
System for en-es: 'Codex Global - English-Spanish SMT'
ID: smt-dd249f8c-e27c-46cf-80d4-87034d3b680b
System for en-lt: '[WIP] English - Lithuanian IT (v6) DYNAMIC'
ID: smt-ffe84252-53f9-4113-b454-cab725b1b1d6
System for en-zh: 'AP English-Chinese V4 (Traditional HK)'
ID: smt-312c4042-8f46-4247-bdd8-3dcd278e678c
System for en-lv: 'Test 1'
ID: smt-2ccb025c-e94d-47b1-b16a-08a0b3704fb0
System for de-fr: '[Hieronymus] DE-FR data fetch'
ID: smt-f4c844eb-bf54-438a-b778-ffc01d22350a
System for en-ru: 'English - Russian - WMT 2014'
ID: smt-c9aa2618-b830-4faa-ae3e-6c2ce80ff312
System for lv-en: 'Medicine LV-EN'
ID: smt-5ef15791-ab78-476e-ae50-b6714f46e096
System for en-lt: 'LT-MT3 EN-LT legal domain fetch '
ID: smt-448603ad-5cea-4dbe-ba28-44ac77ba5d38
System for en-sv: 'NLG English-Swedish NMT System'
ID: smt-01dff081-808e-4eca-aa92-b8c395d570b3
System for en-lv: 'TB2016 EN-LV v0.1 (more data)'
ID: smt-362b463d-2191-4349-a10f-7f9403709515
System for en-de: 'Austrian presidency - En-De - NMT'
ID: smt-0bd67c10-8317-45ac-9c4a-841844f2af7e
System for en-lt: 'English - Lithuanian IT (v4) - For QE Model Training - Trial 2'
ID: smt-f426c59c-f72d-4d6a-ba4e-1b0c57f3ba2d
System for en-lv: '[OLD] English - Latvian IT (EN-LV v5.3)'
ID: smt-748a8ffe-8482-402b-9c19-c75b642e10de
System for et-de: 'Dummy for Et-De'
ID: smt-b3f4f9b1-0e61-465d-995a-495d75ad877c
System for en-lv: 'English - Latvian IT (for ORACLE experiment 3)'
ID: smt-de588210-d084-409a-ac1f-9dd2c3613ee5
System for en-et: 'English - Estonian IT - Microsoft (v5.0)'
ID: smt-ecb67be9-6abc-49f6-9864-304adc965433
System for en-lv: 'Angļu - Latviešu (Vispārēja) - Test System (MLSTM)'
ID: smt-6ebbc31f-d814-4087-8589-10b8292159a2
System for de-fr: 'German - French (NMT Ensemble) Hieronymus'
ID: smt-8fe555d8-305e-40b4-873a-abf9c82daff9
System for en-lv: 'TB English - Latvian NMT'
ID: smt-5eed677a-31c8-4a2e-bddc-f90c3e6ad96e
System for en-lv: 'EESC corpus test. JRC Acquis + DCEP + DGT + Europarl + EESC'
ID: smt-a04fb95e-ad88-453d-963a-cae5d985c2f6
System for en-pl: '[INTERTEXT] data fetch 1'
ID: smt-f4271fc3-adf0-474d-8ec3-583ad7ba2654
System for de-en: 'DE-EN General System for Kothes'
ID: smt-af3c3f6f-1d1f-4250-a651-90708cb63aa6
System for en-lt: 'English - Lithuanian IT (v6.0) (Oracle) – Decode XML entities' ON'
ID: smt-b4ecf41e-75ee-4d22-9161-5cc28bf51d67
System for en-ja: 'English - Japanese v2 (NMT)'
ID: smt-b8369b5f-61f5-40d4-83d6-cc41cdf43212
System for de-en: 'De-En for Corpus Fetch'
ID: smt-c2931afb-fa4c-4dde-8746-7764323fe27f
System for en-lv: 'EESC corpus test baseline. DGT'
ID: smt-06f67a55-a867-4c8c-a09d-259238898e2a
System for et-en: 'Estonian-English tests on LetsMT5. v1 - baseline'
ID: smt-33ff38b8-2cfc-4c81-b816-6a33cd908116
System for en-lv: 'Linearis En-Lv - NMT'
ID: smt-2495e05c-7593-4508-852e-80bac4e5bbdc
System for en-bg: 'EU Presidency English-Bulgarian NMT (adapted)'
ID: smt-2f65da7c-6b4f-4c29-bd47-ada26111c861
System for en-lv: 'English - Latvian IT (EN-LV v5.0)'
ID: smt-1063a6f4-3a24-4fb2-b2fd-57abce86d5fb
System for en-sl: 'English - Slovenian (NMT) - GORR'
ID: smt-ba509496-174a-4926-b400-62d54f8a79d5
System for en-pl: 'EVAL EN-PL - GetIT - General eval'
ID: smt-192ab252-8bd1-40af-88e5-9bfd65c3bc62
System for en-lt: 'Seimas - En-Lt - data fetch'
ID: smt-829145b1-0e50-4458-8468-bb7c54c8f047
System for et-de: 'Estonian - German (NMT) v0.2'
ID: smt-75c66b39-691e-4d60-a075-5004401e5a24
System for lv-en: 'LV-EN Legal Baseline'
ID: smt-2bc36025-c46d-4a6a-844a-437daf89d40c
System for lt-en: 'Seimas - Lt-En - NMT'
ID: smt-4814ebbd-b354-4a9c-83c8-8dbf64f60d44
System for en-de: 'En-De [non-tuned NMT]'
ID: smt-99b2f71a-1b3b-418e-bd6b-125f61a53feb
System for en-lv: 'English - Latvian IT (EN-LV v4.2)'
ID: smt-c2adb90f-ee32-442a-b81a-3666b6124dc6
System for fr-lt: '[LT-MT3] Systems to build data for Probability dictionaries '
ID: smt-26d26f4e-78a9-4ef5-9d17-3153a4683dc0
System for en-lv: 'ODINE - EMA test. EMA UNIQUE only'
ID: smt-a959f31b-b9f5-4749-8157-4cf0a074b4f4
System for en-et: 'English - Estonian. Medicine domain'
ID: smt-afdafea3-e2d0-4574-b091-42e4f837ae19
System for fi-et: 'Finnish - Estonian (NMT) v2'
ID: smt-8a5bbd0f-96c4-42c3-869f-db15abe406ca
System for lt-en: 'WMT19 LT-EN mono corpus fetch'
ID: smt-c8e4778a-13a1-4977-8085-3eebc30a103c
System for en-sv: 'English - Swedish NMT CircleK new'
ID: smt-f2e35605-1d5e-4d69-8664-d27a71a1ac26
System for et-lv: 'Estonian-Latvian baseline'
ID: smt-30eab85e-8ac2-44b4-bcd6-a985f696bec8
System for en-lv: 'ERAF-MT EN-LV (valsts pārvalde) v0.4'
ID: smt-7108bcc9-0646-4b4a-87ac-d240977b9380
System for et-en: 'TB Estonian - English (General) v2.1'
ID: smt-f9b2dea6-628f-440b-b4a6-7e25f08f2470
System for en-lt: 'English - Lithuanian IT (v6) (Oracle)'
ID: smt-960179d7-604c-41d1-b8a0-01c5b78b2f39
System for en-ja: 'English - Japanese (NMT)'
ID: smt-db5ae0b8-a8a7-4af6-b44f-6a840dc57b73
System for pl-en: 'Polish - English Pharmacy (NMT) Kontekst'
ID: smt-5a4c0518-826b-443d-9dd6-b8ca2de5fd49
System for sv-en: 'Swedish - English (NMT) CircleK'
ID: smt-11379275-2632-4928-9388-6ae7dd1e012d
System for en-lt: 'WMT19 EN-LT Unlimited data fetch'
ID: smt-e62e8863-dfb2-41e3-92d4-0d2e9a010cd2
System for sv-en: 'Circle K Data Fetch'
ID: smt-d4531484-8cc9-45a9-89db-35f3d14cbfce
System for lv-ru: 'ERAF-MT2 • Mono TIESLIETU jomas korpusi '
ID: smt-6717b290-947d-4389-bf2c-164603e4ce2b
System for en-lv: 'English - Latvian IT (for ORACLE experiment 2)'
ID: smt-b31e32e5-c83c-4de5-b14a-41d4755603dc
System for es-en: 'Spanish - English (Doppler Labs) - v1.3'
ID: smt-548f46b6-38bb-4fdf-b328-aa81546fee8a
System for en-lt: 'English - Lithuanian IT (Philips) v5'
ID: smt-9442ad67-6422-4c3a-b376-2735e064dd29
System for bg-en: 'EU Presidency Bulgarian-English NMT System (initial)'
ID: smt-5831a8d1-9657-4c45-b657-9797fc4ba8e2
System for en-ro: 'Romanian Presidency - dummy system'
ID: smt-85756325-c426-4659-85aa-405a79094bca
System for en-nl: 'English - Dutch (NMT) Lynx'
ID: smt-8fc59d9e-5566-4e35-af4b-98382578cdf2
System for de-fr: 'German - French (NMT) web-tuned Hieronymus'
ID: smt-1ebfb927-c218-45bf-9313-e9cacfd1350a
System for en-ja: 'English-Japanese MT -- do we have data for this?'
ID: smt-f8beacdc-6472-48e0-adfc-bcfee293d1d2
System for en-et: '[EE EU Presidency 2017] EN-ET - CyberSecurity'
ID: smt-88898ff2-bf11-4972-b980-50c49d9d295a
System for en-fr: 'EN-FR (Canadian) System'
ID: smt-0bc72257-75c3-4064-bd8f-31bfa61aa9d0
System for en-cs: 'Presto EN->CS auto-moto'
ID: smt-36bedbaa-b155-43af-b347-5d9c03832acb
System for en-lv: 'English - Latvian IT (EN-LV v5.4)'
ID: smt-167d7a46-5aec-4d4d-8fdd-6508ddbe6eb8
System for et-en: 'Estonian-English Sockeye NMT System (WMT 2018)'
ID: smt-9ac1c0de-233b-4614-ac69-b1a092bb8f9e
System for en-et: '[EE EU Presidency 2017] EN-ET - Presidency'
ID: smt-978024f2-0701-434d-9e64-6f62ed78a59f
System for en-lv: 'Demonstration System'
ID: smt-2ad189ca-4818-477d-9df6-2ed9321102d3
System for en-nl: 'LYNX EN-NL dummy '
ID: smt-3f2f3a32-0dc7-4f76-9855-a75de937e6f8
System for en-lv: 'Linearis'
ID: smt-b4fc9d46-fd63-45b7-94f4-ab4ac03fa31f
System for en-pl: 'EN-PL Getit-NMT/Google evaluation'
ID: smt-2f15ddda-47fd-4613-9efd-aa9e274b4e32
System for ru-lt: 'Dummy'
ID: smt-74a9bdf7-6a4d-4d05-8b11-52becbba4691
System for en-es: 'ITU - Telecommunications EN-ES NMT'
ID: smt-cf7ff971-ec56-40e2-bb32-1a03973fa56f
System for en-pl: '[BIRETA] EN-PL Data Fetch'
ID: smt-589477c0-85c6-4915-9dc9-5291b9dd3c6e
System for lv-en: 'Rail Baltica Lv-En'
ID: smt-a2980bfc-4c71-47d6-aa70-0d101f8ed287
System for en-lv: 'EESC corpus test baseline. Europarl only'
ID: smt-81d0377e-15a8-48dd-a0d1-1d3afbc1e622
System for de-en: 'mono-translated.de-en - corpus clean'
ID: smt-a97cea8d-3111-488b-84cd-8ca563b5ad8e
System for en-sv: 'CircleK En-Sv Dummy'
ID: smt-8bcf699c-1949-49c6-8c15-d0f8a79336dd
System for et-en: 'Estonian - English (General) v2.1 - Copy'
ID: smt-1616f2f1-40c5-4443-9d5a-721a98046cd0
System for en-es: 'Chess English-Spanish Data Fetch'
ID: smt-bf18fc36-d13b-4af0-a829-4bbebe9b88f2
System for en-pl: 'English - Polish Clinical trials (NMT) Kontekst'
ID: smt-eb12fe16-c5e4-471b-81d6-9528ec41aa45
System for nl-en: 'FREME Dutch-English Legal'
ID: smt-9b69c17c-7c51-4cc9-94d7-8160460c9bc4
System for lv-de: 'LV-DE dummy'
ID: smt-19229494-a44f-4a40-84c2-cea6e30fa00b
System for en-lt: 'English - Lithuanian IT (v6) - Copy'
ID: smt-867a0958-e380-4215-a1a9-0ae790aef9af
System for en-lv: 'Oracle'
ID: smt-8f35f4fb-37ae-4e19-b86a-fc931f430a7b
System for lv-ru: 'LV-RU (Valsts pārvalde) baseline + with-for MultiUN LV-RU evaluation - Copy'
ID: smt-7777f5da-3a19-40c9-98c4-8c580825aca3
System for en-nb: 'STP EN-NO NMT'
ID: smt-67bfee6d-94e2-4a72-9b41-95dea91c136e
System for en-pl: 'EVAL EN-PL - GetIT'
ID: smt-2509b5cf-cbb6-4c23-8931-c80463b455db
System for ru-lt: 'Russian - Lithuanian (NMT) LT-MT3'
ID: smt-46a7fe0e-e1d3-4fd0-ab21-f6d40db609f3
System for pl-en: 'GET IT - Pl-En - NMT'
ID: smt-59d71f23-a43b-442a-84c3-327aedfcfec6
System for et-fi: 'Estonian - Finnish (NMT) v1'
ID: smt-65c04708-bf14-49d6-89b0-d81d6d5b7161
System for lv-de: 'Latvian - German (NMT) v2'
ID: smt-867c0dcd-1ae3-4a9f-9dbc-17f8fd956e52
System for es-ro: '_SeproTec: Spanish-Romanian'
ID: smt-2f22196a-66a4-4709-acfd-7b57c57bc911
System for en-lv: 'Test System'
ID: smt-1cd5bac8-45f3-4b5b-a79d-17985e9d3774
System for et-en: 'TB Estonian - English NMT'
ID: smt-fea9837f-2179-4057-be22-e5cf1c4a316e
System for fr-en: 'FR-EN Baseline NMT'
ID: smt-c24d97d3-536b-4e2d-8810-f2e46a4a897d
System for lt-en: 'LT-MT3 LT-EN IT domain fetch'
ID: smt-4093256f-c36f-47d1-ad63-c8895c4e8578
System for en-lt: 'English - Lithuanian IT'
ID: smt-94aa3716-9ac3-4588-bf6b-b838145feeff
System for lv-en: 'TB2016 LV-EN v01.1 Date localisation'
ID: smt-2830e860-c135-4f5c-917c-7d3d669f9419
System for fr-sv: 'TextMinded fr-sv - v1'
ID: smt-3d4fc9fa-08c8-4402-8d09-6b0bde84d20e
System for en-lv: 'EESC corpus test. JRC Acquis + EESC'
ID: smt-2e246eb8-ba9b-4a28-b7b6-a1fa0b6c865a
System for en-pl: 'EVAL EN-PL'
ID: smt-ad84638a-5e79-4be9-9f7d-5d550bfae42e
System for en-ar: 'AP English-Arabic v1'
ID: smt-d7570198-5d62-4700-8af1-7f96ad711557
System for en-lv: 'English - Latvian IT (EN-LV v5.2)'
ID: smt-2c4b290c-76a3-45ee-8096-0d5b167ebfd2
System for ja-en: 'Japanese - English (NMT)'
ID: smt-5dab723e-195e-468d-a7b5-69e6e63fe2aa
System for lt-pl: 'LT-MT3 LT-PL Data Fetch'
ID: smt-b0f78069-4d5f-40b9-9508-4638f02bd591
System for en-da: 'Amesto EN-DK baseline eval using client's eval corpus'
ID: smt-38eca5c1-f060-4ae1-ad22-1aa8230bf5b5
System for en-ro: 'English - Romanian (NMT) Presidency'
ID: smt-d57b1605-598b-46a8-8ad9-4b8e2499b9cf
System for en-lt: 'English - Lithuanian (General)'
ID: smt-9ec7846e-23d2-4f6a-82d2-fa7be380b19a
System for en-ga: 'en-ga-test'
ID: smt-c836202f-7400-4145-b5b1-dcbafc5d8448
System for lv-en: 'Latviešu - Angļu (Vispārēja) - Sockeye NMT'
ID: smt-707fe5ce-98f4-46ae-b01a-03070a0db25c
System for lt-ru: '[LT-MT3] Systems to build data for Probability dictionaries'
ID: smt-7d77024a-c759-4b0b-8868-129e54316625
System for de-lt: 'German - Lithuanian (NMT) LT-MT3'
ID: smt-8b22a71a-00e9-4e10-88f0-d68e9530da59
System for en-lv: 'English - Latvian IT (v7.0) - for NMT training'
ID: smt-4e4fa6e2-abba-4056-ab81-6e407d077b01
System for et-en: 'Estonian - English (General) v2.2'
ID: smt-0e4c813d-e2be-4f2d-8889-d6bb99d36099
System for lv-en: 'Linearis LV-EN - data fetch'
ID: smt-59bc1198-606e-4a36-8c3f-bb07722f6aa2
System for en-lv: 'ERAF-MT2 • Mono TIESLIETU jomas korpusi'
ID: smt-00f9a76e-7b4d-48a3-b1b7-07be4f172db5
System for en-et: 'Oracle'
ID: smt-9030dfed-1121-49e0-a427-2e648bf59feb
System for en-sl: 'Amidas En-Sl'
ID: smt-abe37866-81f2-4863-a358-b620ad8a4160
System for et-fi: 'Et-Fi Dummy system'
ID: smt-5121ff36-70a2-4f4e-b37c-0a88c32973fd
System for en-et: 'EN-ET-Marian-NMT - Updated Files'
ID: smt-73a303d6-6e20-4869-89cc-a275304357bb
System for en-lv: 'TB2015 EN-LV v02.1'
ID: smt-505293d9-9b8f-441f-bea4-d830f179ee18
System for en-et: 'Celsius EN-ET-SMT'
ID: smt-6e36d478-f2ce-4ad5-9b02-ab8da68947d7
System for de-lv: 'DE-LV data fetch2'
ID: smt-8b3c5570-8646-477d-a691-5883a3f9bc98
System for en-ar: 'TB English-Arabic v1 - NMT (production)'
ID: smt-c4a3aa54-92b0-48de-88fd-feabdfab6219
System for de-fr: '[Hieronymus] DE-FR In-Domain NMT System'
ID: smt-98090b03-4be1-494e-a771-6d5b33af6078
System for fi-en: 'Stockmann FI-EN data fetch'
ID: smt-c2a94829-e4bf-4264-a1bc-16b1a6d63795
System for nl-en: 'FREME Dutch-English Legal - v1.1'
ID: smt-12fc5829-3071-4eec-b07d-d7abb8ae1bc7
System for lv-en: 'Latviešu - Angļu (Mežu nozares)'
ID: smt-6b3b1e56-84c9-43f6-86c9-cf18e946b1bd
System for en-fi: 'STP EN-FI NMT (preliminary)'
ID: smt-0e03379f-cfb3-46ba-9397-e6c6c901e808
System for en-sk: 'Codex Global - English-Slovak SMT'
ID: smt-353633c8-aee0-4f41-b7c2-558944d6d5cc
System for fi-et: 'Finnish - Estonian (NMT) v1'
ID: smt-e3f36b28-ef22-42f8-a4fe-da48acd41418
System for en-et: 'English - Estonian IT with Dynamic Learning Enabled'
ID: smt-e15e39d7-61fd-49f2-a50f-d232fb76d4a9
System for en-lt: 'English - Lithuanian IT (v5.7) (Oracle)'
ID: smt-2c5c8950-a46c-4a2f-8f5f-8e1920e62bdd
System for lt-en: 'Lithuanian - English (NMT) IT LT-MT3'
ID: smt-88f0a69d-ad3d-4b6d-bd02-abc89cbecc85
System for en-lt: 'Seimas - En-Lt - NMT'
ID: smt-0964dacf-d3e2-4b00-b36d-f9bfda082389
System for lt-en: 'LT-EN (General) NMT'
ID: smt-67631dc9-0bb7-4999-b12d-eac6ae189db9
System for en-et: 'English - Estonian IT (v5.3)'
ID: smt-effb3bd7-08e3-40b3-9bad-da62b4e394b9
System for en-lv: 'TB2015 EN-LV v02.1'
ID: smt-5599bc15-6f44-4b89-a465-4930fb2ba47f
System for en-lt: 'English - Lithuanian IT (v6.0) (Oracle) – Decode XML entities' ON – bez vecajiem LocDoc'
ID: smt-ab369469-901f-49e8-a852-b4614d376695
System for lv-en: 'ERAF-MT2 angļu kultūra'
ID: smt-4da2a903-6e2a-4ab8-961b-f8c1b8e5e16d
System for de-lt: '[LT-MT3] Systems to build data for Probability dictionaries '
ID: smt-0deac4c5-3131-49c1-ae34-0bccca424c71
System for en-et: '[OBSOLETE] inglise - eesti NMT system'
ID: smt-35abecbd-565f-44e3-9999-b6decc5a6eac
System for da-en: 'Danish - English (NMT) AGA'
ID: smt-8c11e4ce-91cc-4ae0-8cac-f0fea4302361
System for en-sv: 'English - Swedish (NMT) CircleK'
ID: smt-c6bb32b3-5d1a-456a-89bf-698a397ad736
System for lt-ru: 'Lt-Ru (General) NMT'
ID: smt-41d3bdfd-e58a-4879-917b-9cb9c5705888
System for en-et: 'Automotive'
ID: smt-ceedf681-ecfd-469e-b737-a252975bffd4
System for da-en: 'Yet another DA-EN baseline v2'
ID: smt-aebcc383-65c6-499b-bfe6-6d5e2a10fe81
System for lv-en: 'Linearis - Lv-En - NMT'
ID: smt-d1455a59-2299-4eb2-b87f-f29071ea9815
System for en-pl: 'English - Polish INTERTEXT Legal & Finance (NMT)'
ID: smt-7fc23609-86f7-4199-a438-1e2e902f4bd3
System for fr-lt: 'French - Lithuanian (NMT) Legal LT-MT3'
ID: smt-690a7c17-c78b-47c4-9127-7230f6e0db32
System for en-lt: 'English - Lithuanian IT (v5.6) (Oracle)'
ID: smt-d7304849-f9c7-4320-87fd-4124427926ab
System for en-lt: 'English - Lithuanian (NMT) WMT 2019'
ID: smt-31f11613-ab4d-43d9-a828-88a98e49d4af
System for en-ro: 'Romanian Presidency - backtranslation data fetch 1'
ID: smt-c8ded83e-8906-4000-8227-2a44a4eaca9a
System for lv-de: 'Latvian - German (NMT)'
ID: smt-f967086d-18e2-4bf3-a4c7-467ab2079598
System for en-lv: 'TB2016 EN-LV v0.1 (less data)'
ID: smt-4bf418df-aaba-4e7e-8047-a2ad55a2b382
System for de-fr: 'German - French (NMT) Hieronymus'
ID: smt-2dd7addb-7a9e-4c1c-8e99-bd2fb12b9245
System for ru-lt: 'Seimas - Ru-Lt - NMT'
ID: smt-617f16f7-1a2c-4305-90a1-70a13bc04f81
System for fi-et: 'FI-ET v0.1'
ID: smt-578d3487-2b96-4032-8b43-a9bd8a583f0a
System for et-en: '[EE EU Presidency 2017] ET-EN - Presidency'
ID: smt-3e62fc51-6df8-4808-9a7f-2149f5fdc4dd
System for pl-en: 'Polish - English (NMT) Diuna (Law & Finance)'
ID: smt-e20e02e8-97c8-40cf-b8de-6db214fd64eb
System for en-lt: 'English - Lithuanian (NMT) IT LT-MT3'
ID: smt-2212985c-28b4-4d84-a1f4-f00ac332874f
System for ja-en: 'Japanese-English SMT'
ID: smt-85b341b2-c40a-4d1b-9297-f247b4f6c22a
System for en-fr: 'EN-FR Baseline NMT'
ID: smt-9fc9ddbe-a714-4281-9e7b-3ebc55e1fad6
System for en-da: 'Yet another EN-DA baseline'
ID: smt-c71c7ccd-37a2-401c-9447-6f18b75cf18a
System for en-ro: 'ro-en-test'
ID: smt-61b820f7-b955-4676-b517-09e32898fd51
System for en-lt: 'English - Lithuanian (NMT) Seimas v2'
ID: smt-df7a882a-1146-4fad-8422-31158a025556
System for de-en: 'DE-EN General System for Kothes - Copy'
ID: smt-93fbccee-71b2-4101-87ad-12c335c27652
System for pl-en: 'Polish - English General (NMT)'
ID: smt-c9b2c200-2370-4e1d-a1da-9e1f489a4217
System for fr-lt: 'FR-LT LT-MT3 - Comparable SMT System'
ID: smt-eb5873b3-9211-496c-9102-0fc2201572d4
System for en-pl: 'English - Polish Pharmacy (NMT) Kontekst'
ID: smt-4ee45ca7-c633-4a43-85f5-dbca960b822d
System for en-lv: 'ODINE - OPUS EMEA test. OPUS EMEA only'
ID: smt-1c96aedd-b80f-414a-974d-f77feb8e1fc7
System for de-fr: 'Hieronymus DE-FR'
ID: smt-71b92f58-8563-48e7-bc16-cb895958aa3b
System for en-bg: 'EU Presidency English-Bulgarian NMT (initial)'
ID: smt-d44cd645-bc44-4c65-820e-65c4425a0f46
System for lt-en: 'WMT19 LT-EN Data Fetch'
ID: smt-d861be41-8b0b-4ba7-a490-5655ef8fd623
System for en-da: 'Atea - English-Danish NMT'
ID: smt-c3c51e85-7c13-4b70-b314-977890e32fa4
System for en-lv: 'EESC corpus test baseline. DCEP only'
ID: smt-ba6a6583-94e5-431e-b3f4-ac88868450e8
System for fr-lt: 'FR-LT LT-MT3 - data fetch'
ID: smt-edd16e25-c09b-43ad-8884-1dfae01fd2da
System for en-pl: 'Diuna Polish Patents (EN-PL)'
ID: smt-0ea25eca-efae-47b2-8bfb-6d0f1fa3619d
System for en-de: 'Austrian presidency mono data Only good newlines - German'
ID: smt-b408b9d6-d50a-4418-83fa-7a3b8b24b669
System for en-cs: 'English - Czech (NMT) Presto'
ID: smt-5c06cd10-bb5e-43b1-9889-dc40285e3756
System for lv-ru: 'Tests #1 LV-RU 'sinonīmu vārdnīcas' izveidei'
ID: smt-bb583163-9172-4e72-a913-2d1695fd5830
System for en-lt: 'WMT19 EN-LT SMT Baseline V2'
ID: smt-bc6d1bad-1b46-4307-abb9-b95f037d69cd
System for lt-ru: 'LT-MT3 LT-RU Data Fetch'
ID: smt-999f76d0-6ab2-4e2b-ae1d-c575a69012ec
System for lv-en: 'TB Latvian - English (v03)'
ID: smt-f59d9946-924e-47a7-a136-2fe66cfb77ef
System for en-es: 'Printful English-Spanish Data Fetch'
ID: smt-39d8d5f2-8612-4b49-9348-77437a196f82
System for et-en: 'EU Presidency NMT system (ET-EN)'
ID: smt-3ea525f9-e9b4-4b30-8c14-febb66e40f6e
System for ro-en: 'Romanian Presidency data fetch'
ID: smt-28c52ba8-cfb4-424b-8b65-cf43224540a2
System for nl-en: 'Dutch - English (NMT) Lynx'
ID: smt-2eb02c32-1406-45a0-8974-0310becf564b
System for en-fr: 'EN-FR (Canadian) TRSB Adapted System - NMT'
ID: smt-454b7e1f-c3dc-486a-9e63-1541ac8aef50
System for en-es: 'EN-ES [SMT]'
ID: smt-e97495e3-559a-4395-b224-d29e924f7f7c
System for lt-ru: 'IADAATPA LT-RU'
ID: smt-802b1632-95fb-48de-8cd7-e9ddfd116853
System for en-de: 'Codex Global - English-German SMT'
ID: smt-08cd7e6a-6aae-40fb-beba-5101c7b1c5af
System for en-lv: 'EN-LV 2019 data fetch'
ID: smt-c6923cd7-14c9-46ce-995e-bedfc3a1a636
System for lt-fr: 'LT-FR Legal LT-MT3 - data fetch '
ID: smt-2094cec5-9ab2-431f-95ec-11f8d4dff4d4
System for de-en: 'De -En [NMT]'
ID: smt-160de000-f719-4d5b-9daa-34859345e889
System for pl-en: 'Diuna Law & Finance [data fetch]'
ID: smt-50d1a6be-6e39-40fd-8e7a-47250a1edb4e
System for en-lv: 'Angļu - Latviešu (Vispārēja)'
ID: smt-643d5907-82c2-4a1a-849d-ba7822aef036
System for en-pl: 'English - Polish Patents (NMT)'
ID: smt-90034e3b-fbe3-4f88-8d4d-ff37f3699fe0
System for en-lv: 'ERAF-MT EN-LV (Valsts pārvalde) v0.3'
ID: smt-c31ba483-3284-4a28-bb18-2abbbbb3cd05
System for en-ru: 'CHESS game strategy, notes, comments'
ID: smt-bb9115be-804b-42f4-a52b-8008470c2a21
System for en-sl: 'English-Slovenian'
ID: smt-f9686767-aadd-4a5d-8d31-4daf53dd77a9
System for ru-lv: 'TB RU-LV v.02'
ID: smt-95f5a52c-86fe-467a-b15e-4058df157cf9
System for en-pl: 'Kontekst data fetch'
ID: smt-1dd3547e-0929-4985-a4d3-36a6a2218590
System for en-es: 'English - Spanish (NMT) Lynx'
ID: smt-7f098605-5838-4f84-b73e-94af698c3e00
System for en-fr: 'EN-FR Baseline NMT (fast)'
ID: smt-a5f350f1-351c-4767-a12c-5a7fe5c2dfb0
System for fr-en: 'FR-EN Baseline NMT (fast)'
ID: smt-14c7a262-824b-46c1-8d90-841aa38429b9
System for en-et: '[EE EU Presidency 2017] EN-ET - General Domain v2'
ID: smt-a0623ae4-b245-4e4e-a2ee-0c3b29684020
System for en-et: 'Tilde Localization. English-Estonian IT'
ID: smt-e3a26b6a-a6e6-4b7f-b535-cfee07caac8c
System for en-lv: 'ERAF-MT2 • Paralēlie EN-LV-EN korpusi'
ID: smt-afc76215-6de9-44ec-9ee7-e5562dd2bb15
System for en-pl: 'INTERTEXT Polish Medical (EN-PL)'
ID: smt-3339aff2-ffbd-469b-9fb6-483885c0afd6
System for pl-en: 'Polish - English - GetIT - v9 - Legal NMT'
ID: smt-d03e04f7-d662-4e0a-b9ac-f58e57ff55a7
System for de-en: 'Technik DE_ENGB v2'
ID: smt-39f2a60c-d184-4450-a397-3aea517875ca
'''
Build production-ready translation models using mT5, data preparation, model fine-tuning, evaluation metrics and deployment strategies.
What is mT5 and Why Use It for Translation?
mT5 extends Google's T5 architecture to support multilingual tasks. Unlike BERT or GPT models, mT5 treats every problem as text-to-text conversion. This approach works perfectly for translation tasks.
Key Advantages of mT5 Translation Models
Multilingual Support: mT5 handles 101 languages out of the box. You don't need separate models for each language pair.
Transfer Learning: The model learns patterns across languages. Training on high-resource languages improves low-resource translation quality.
Flexible Architecture: The same model architecture works for translation, summarization, and question answering tasks.
Pre-trained Weights: Google provides pre-trained mT5 models. You start with strong baselines instead of random weights.
Prerequisites and Environment Setup
You need Python 3.8+, PyTorch, and the Transformers library. GPU access speeds up training significantly.
# Install required packages
pip install transformers torch datasets evaluate sacrebleu
pip install accelerate wandb # Optional: for training acceleration and logging
# Import essential libraries
import torch
from transformers import (
MT5ForConditionalGeneration,
MT5Tokenizer,
Trainer,
TrainingArguments,
DataCollatorForSeq2Seq
)
from datasets import Dataset, load_dataset
import evaluate
import numpy as np
Check your GPU setup:
# Verify CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
print(f"GPU: {torch.cuda.get_device_name(0)}")
Understanding mT5 Architecture for Translation
mT5 uses an encoder-decoder structure. The encoder processes source text, while the decoder generates target translations.
Text-to-Text Format
mT5 requires specific input formatting. Add task prefixes to guide the model:
# Format examples for different translation directions
def format_translation_input(source_text, source_lang, target_lang):
"""Format input text for mT5 translation"""
prefix = f"translate {source_lang} to {target_lang}: "
return prefix + source_text
# Examples
english_to_french = format_translation_input("Hello world", "English", "French")
spanish_to_english = format_translation_input("Hola mundo", "Spanish", "English")
print(english_to_french) # "translate English to French: Hello world"
print(spanish_to_english) # "translate Spanish to English: Hola mundo"
Data Preparation and Preprocessing
Quality training data determines model performance. We'll use the OPUS dataset, which contains millions of parallel sentences.
Loading Translation Datasets
# Load a sample translation dataset
def load_translation_data(language_pair="en-fr", split="train", max_samples=10000):
"""Load and preprocess translation data"""
# Load OPUS-100 dataset for the language pair
try:
dataset = load_dataset("opus100", language_pair, split=split)
# Limit samples for faster training
if max_samples and len(dataset) > max_samples:
dataset = dataset.select(range(max_samples))
return dataset
except Exception as e:
print(f"Error loading dataset: {e}")
return None
# Load English-French translation data
train_data = load_translation_data("en-fr", "train", 5000)
val_data = load_translation_data("en-fr", "validation", 1000)
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
Data Preprocessing Pipeline
class TranslationDataProcessor:
def __init__(self, tokenizer, source_lang="en", target_lang="fr", max_length=128):
self.tokenizer = tokenizer
self.source_lang = source_lang
self.target_lang = target_lang
self.max_length = max_length
def preprocess_function(self, examples):
"""Preprocess translation examples for training"""
# Extract source and target texts
source_texts = examples['translation'][self.source_lang]
target_texts = examples['translation'][self.target_lang]
# Format inputs with task prefix
inputs = [
f"translate {self.source_lang} to {self.target_lang}: {text}"
for text in source_texts
]
# Tokenize inputs and targets
model_inputs = self.tokenizer(
inputs,
max_length=self.max_length,
truncation=True,
padding=True,
return_tensors="pt"
)
# Tokenize targets
with self.tokenizer.as_target_tokenizer():
labels = self.tokenizer(
target_texts,
max_length=self.max_length,
truncation=True,
padding=True,
return_tensors="pt"
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
# Initialize tokenizer and processor
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
processor = TranslationDataProcessor(tokenizer, "en", "fr")
# Process datasets
train_dataset = train_data.map(
processor.preprocess_function,
batched=True,
remove_columns=train_data.column_names
)
val_dataset = val_data.map(
processor.preprocess_function,
batched=True,
remove_columns=val_data.column_names
)
Fine-tuning mT5 for Translation
Fine-tuning adapts the pre-trained mT5 model to your specific translation task. We'll use Hugging Face's Trainer class for efficient training.
Model Initialization
# Load pre-trained mT5 model
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
# Move model to GPU if available
model = model.to(device)
print(f"Model parameters: {model.num_parameters():,}")
print(f"Model size: {model.num_parameters() * 4 / 1024**2:.1f} MB")
Training Configuration
# Set up training arguments
training_args = TrainingArguments(
output_dir="./mt5-translation-model",
eval_strategy="steps",
eval_steps=500,
save_steps=1000,
logging_steps=100,
learning_rate=5e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=3,
weight_decay=0.01,
warmup_steps=500,
save_total_limit=3,
load_best_model_at_end=True,
metric_for_best_model="eval_bleu",
greater_is_better=True,
fp16=True, # Enable mixed precision training
dataloader_pin_memory=True,
remove_unused_columns=False,
report_to="wandb", # Optional: for experiment tracking
)
Evaluation Metrics Setup
# Load BLEU metric for evaluation
bleu_metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
"""Compute BLEU score for evaluation"""
predictions, labels = eval_preds
# Decode predictions and labels
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
# Replace -100 labels with pad token
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
# Compute BLEU score
result = bleu_metric.compute(
predictions=decoded_preds,
references=[[label] for label in decoded_labels]
)
return {
"bleu": result["score"],
"precisions": result["precisions"],
}
# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
tokenizer=tokenizer,
model=model,
padding=True,
return_tensors="pt"
)
Training the Model
# Initialize trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
# Start training
print("Starting training...")
train_result = trainer.train()
# Save the final model
trainer.save_model()
tokenizer.save_pretrained("./mt5-translation-model")
print(f"Training completed!")
print(f"Final training loss: {train_result.training_loss:.4f}")
Training typically takes 2-4 hours on a modern GPU. Monitor the loss curves to ensure the model converges properly.
Model Evaluation and Performance Testing
Proper evaluation reveals model strengths and weaknesses. We'll use BLEU scores and human-like quality assessments.
Automated Evaluation with BLEU
def evaluate_translation_model(model, tokenizer, test_data, device):
"""Evaluate model performance on test data"""
model.eval()
predictions = []
references = []
with torch.no_grad():
for example in test_data:
# Prepare input
source = example['translation']['en']
target = example['translation']['fr']
input_text = f"translate en to fr: {source}"
# Tokenize input
inputs = tokenizer(
input_text,
return_tensors="pt",
max_length=128,
truncation=True
).to(device)
# Generate translation
outputs = model.generate(
**inputs,
max_length=128,
num_beams=4,
early_stopping=True,
do_sample=False
)
# Decode prediction
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
predictions.append(prediction)
references.append([target])
# Calculate BLEU score
bleu_score = bleu_metric.compute(predictions=predictions, references=references)
return {
"bleu_score": bleu_score["score"],
"predictions": predictions[:5], # First 5 examples
"references": [ref[0] for ref in references[:5]]
}
# Load test data
test_data = load_translation_data("en-fr", "test", 500)
# Evaluate model
results = evaluate_translation_model(model, tokenizer, test_data, device)
print(f"BLEU Score: {results['bleu_score']:.2f}")
print("\nSample Translations:")
for i, (pred, ref) in enumerate(zip(results['predictions'], results['references'])):
print(f"Prediction {i+1}: {pred}")
print(f"Reference {i+1}: {ref}")
print("-" * 50)
Quality Assessment Examples
def translate_text(model, tokenizer, text, source_lang="en", target_lang="fr"):
"""Translate a single text using the fine-tuned model"""
# Format input
input_text = f"translate {source_lang} to {target_lang}: {text}"
# Tokenize
inputs = tokenizer(
input_text,
return_tensors="pt",
max_length=128,
truncation=True
).to(device)
# Generate translation
with torch.no_grad():
outputs = model.generate(
**inputs,
max_length=128,
num_beams=4,
temperature=0.7,
do_sample=True,
early_stopping=True
)
# Decode output
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
return translation
# Test various sentence types
test_sentences = [
"The weather is beautiful today.",
"Can you help me find the nearest restaurant?",
"Machine learning transforms how we solve problems.",
"I love reading books in my free time.",
"The meeting has been postponed until tomorrow."
]
print("Translation Quality Examples:")
for sentence in test_sentences:
translation = translate_text(model, tokenizer, sentence)
print(f"EN: {sentence}")
print(f"FR: {translation}")
print("-" * 60)
Deployment and Production Considerations
Moving from training to production requires optimization for speed and resource usage.
Model Optimization
# Optimize model for inference
def optimize_model_for_inference(model):
"""Apply optimizations for faster inference"""
# Set to evaluation mode
model.eval()
# Compile model (PyTorch 2.0+)
if hasattr(torch, 'compile'):
model = torch.compile(model)
return model
# Create inference pipeline
class TranslationPipeline:
def __init__(self, model_path, device="cuda"):
self.device = device
self.tokenizer = MT5Tokenizer.from_pretrained(model_path)
self.model = MT5ForConditionalGeneration.from_pretrained(model_path)
self.model = self.model.to(device)
self.model = optimize_model_for_inference(self.model)
def translate(self, text, source_lang="en", target_lang="fr", **kwargs):
"""Translate text with optimized pipeline"""
input_text = f"translate {source_lang} to {target_lang}: {text}"
inputs = self.tokenizer(
input_text,
return_tensors="pt",
max_length=128,
truncation=True
).to(self.device)
# Generation parameters
gen_kwargs = {
"max_length": 128,
"num_beams": 4,
"early_stopping": True,
"do_sample": False,
**kwargs
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
return translation
# Initialize production pipeline
translator = TranslationPipeline("./mt5-translation-model", device)
# Test production pipeline
sample_text = "Hello, how are you doing today?"
result = translator.translate(sample_text, "en", "fr")
print(f"Production translation: {result}")
API Deployment Example
# Simple Flask API for translation service
from flask import Flask, request, jsonify
import time
app = Flask(__name__)
# Initialize translator (do this once at startup)
translator = TranslationPipeline("./mt5-translation-model")
@app.route('/translate', methods=['POST'])
def translate_api():
"""API endpoint for translation requests"""
try:
data = request.get_json()
# Extract parameters
text = data.get('text', '')
source_lang = data.get('source_lang', 'en')
target_lang = data.get('target_lang', 'fr')
# Validate input
if not text:
return jsonify({'error': 'Text parameter is required'}), 400
# Measure translation time
start_time = time.time()
translation = translator.translate(text, source_lang, target_lang)
processing_time = time.time() - start_time
return jsonify({
'translation': translation,
'source_lang': source_lang,
'target_lang': target_lang,
'processing_time': round(processing_time, 3)
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
"""Health check endpoint"""
return jsonify({'status': 'healthy', 'model': 'mT5-translation'})
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=False)
Advanced Techniques and Optimizations
Improve model performance with advanced training strategies and architectural modifications.
Multi-GPU Training
# Distributed training setup
from torch.nn.parallel import DistributedDataParallel
from accelerate import Accelerator
def setup_distributed_training():
"""Configure multi-GPU training"""
accelerator = Accelerator()
# Updated training arguments for distributed training
training_args = TrainingArguments(
output_dir="./mt5-distributed",
per_device_train_batch_size=4, # Smaller batch per GPU
gradient_accumulation_steps=4, # Effective batch size = 4*4*num_gpus
dataloader_pin_memory=True,
ddp_find_unused_parameters=False,
**training_args.__dict__ # Inherit other arguments
)
return accelerator, training_args
Curriculum Learning
def create_curriculum_dataset(dataset, difficulty_fn, stages=3):
"""Create curriculum learning dataset"""
# Calculate difficulty scores
difficulties = [difficulty_fn(example) for example in dataset]
# Sort by difficulty
sorted_indices = np.argsort(difficulties)
# Create stages
stage_size = len(dataset) // stages
curriculum_stages = []
for i in range(stages):
start_idx = i * stage_size
end_idx = (i + 1) * stage_size if i < stages - 1 else len(dataset)
stage_indices = sorted_indices[start_idx:end_idx]
curriculum_stages.append(dataset.select(stage_indices))
return curriculum_stages
def sentence_difficulty(example):
"""Simple difficulty metric based on sentence length"""
source_len = len(example['translation']['en'].split())
target_len = len(example['translation']['fr'].split())
return max(source_len, target_len)
Common Issues and Troubleshooting
Building translation models involves several potential pitfalls. Here are solutions to common problems.
Memory Management
# Handle CUDA out of memory errors
def handle_memory_issues():
"""Strategies for managing GPU memory"""
# Clear cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Reduce batch size
training_args.per_device_train_batch_size = 4
training_args.gradient_accumulation_steps = 4
# Enable gradient checkpointing
training_args.gradient_checkpointing = True
# Use FP16 training
training_args.fp16 = True
print("Applied memory optimization settings")
# Monitor GPU memory usage
def monitor_gpu_memory():
"""Track GPU memory consumption"""
if torch.cuda.is_available():
allocated = torch.cuda.memory_allocated() / 1024**3
cached = torch.cuda.memory_reserved() / 1024**3
print(f"GPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB")
Model Performance Issues
# Debug poor translation quality
def debug_model_performance(model, tokenizer, problem_examples):
"""Analyze model behavior on problematic examples"""
for example in problem_examples:
source = example['source']
expected = example['target']
# Get model prediction
prediction = translate_text(model, tokenizer, source)
# Analyze tokenization
source_tokens = tokenizer.tokenize(f"translate en to fr: {source}")
target_tokens = tokenizer.tokenize(expected)
print(f"Source: {source}")
print(f"Expected: {expected}")
print(f"Predicted: {prediction}")
print(f"Source tokens ({len(source_tokens)}): {source_tokens}")
print(f"Target tokens ({len(target_tokens)}): {target_tokens}")
print("-" * 80)
# Example problematic cases
problem_cases = [
{"source": "Bank", "target": "Banque"}, # Ambiguous word
{"source": "The bank is closed", "target": "La banque est fermée"},
{"source": "I bank on you", "target": "Je compte sur toi"}
]
debug_model_performance(model, tokenizer, problem_cases)
Comparison with Other Translation Approaches
Understanding mT5's position in the translation landscape helps you make informed decisions.
mT5 vs Traditional Statistical Methods
Statistical Machine Translation (SMT) relies on phrase tables and language models. These systems require extensive parallel corpora and struggle with long-range dependencies.
mT5 Advantages:
- Handles context better through attention mechanisms
- Requires less manual feature engineering
- Transfers knowledge across languages
- Adapts to domain-specific terminology through fine-tuning
mT5 vs Other Neural Approaches
Sequence-to-Sequence Models with LSTM/GRU architectures preceded transformers. They suffer from vanishing gradients and limited context windows.
BERT-based Translation uses encoder-only architecture. This approach requires additional decoder components and complex training procedures.
mT5 Benefits:
- Unified text-to-text framework
- Pre-trained on massive multilingual data
- Consistent performance across language pairs
- Simpler fine-tuning process
Performance Benchmarks and Results
Real-world performance data helps set expectations for your mT5 translation models.
BLEU Score Expectations
| Language Pair | mT5-Small | mT5-Base | mT5-Large |
|---|---|---|---|
| EN-FR | 28.5 | 32.1 | 35.7 |
| EN-DE | 25.2 | 28.9 | 32.4 |
| EN-ES | 31.8 | 35.2 | 38.6 |
| EN-ZH | 22.1 | 25.7 | 29.3 |
Training Time and Resource Requirements
| Model Size | Parameters | Training Time | GPU Memory | Inference Speed |
|---|---|---|---|---|
| mT5-Small | 300M | 2-4 hours | 8GB | 50 tokens/sec |
| mT5-Base | 580M | 6-8 hours | 16GB | 35 tokens/sec |
| mT5-Large | 1.2B | 12-16 hours | 32GB | 20 tokens/sec |
Benchmarks based on 10k training samples, NVIDIA V100 GPU
Future Improvements and Extensions
Your mT5 translation model can grow more sophisticated with additional techniques.
Multilingual Extensions
# Support multiple language pairs in one model
def create_multilingual_dataset(language_pairs):
"""Combine datasets for multiple language pairs"""
combined_dataset = []
for source_lang, target_lang in language_pairs:
pair_data = load_translation_data(f"{source_lang}-{target_lang}")
# Add language pair information
for example in pair_data:
example['source_lang'] = source_lang
example['target_lang'] = target_lang
combined_dataset.append(example)
return Dataset.from_list(combined_dataset)
# Create multilingual training data
language_pairs = [("en", "fr"), ("en", "de"), ("en", "es"), ("fr", "de")]
multilingual_data = create_multilingual_dataset(language_pairs)
Domain Adaptation
# Fine-tune for specific domains
def create_domain_specific_data(domain="medical"):
"""Load domain-specific translation data"""
domain_datasets = {
"medical": "medical_translation_corpus",
"legal": "legal_translation_corpus",
"technical": "technical_translation_corpus"
}
# Load domain-specific data
# Implementation depends on your data sources
pass
# Gradual domain adaptation
def gradual_domain_adaptation(model, general_data, domain_data, steps=3):
"""Gradually adapt model to specific domain"""
# Step 1: Train on general data
# Step 2: Mix general and domain data (80:20)
# Step 3: Focus on domain data (20:80)
pass
Conclusion
Building translation models with mT5 transforms complex multilingual challenges into manageable engineering tasks. You've learned to prepare datasets, fine-tune models, evaluate performance, and deploy production systems.
Key takeaways from this guide:
Start Small: Use mT5-small for prototyping. Scale up to larger models once you validate your approach.
Data Quality Matters: Clean, diverse training data produces better translations than large volumes of noisy text.
Evaluation is Critical: BLEU scores provide baselines, but human evaluation reveals real-world quality.
Optimize for Production: Model compression, caching, and hardware acceleration make deployment viable.
The mT5 architecture handles 100+ languages with consistent quality. Your translation models can now bridge communication gaps across global audiences.
Ready to build your first mT5 translator? Start with the environment setup and work through each section. The code examples provide working implementations you can adapt to your specific needs.
Next Steps: Experiment with different language pairs, explore domain adaptation techniques, and consider implementing real-time translation APIs for your applications.

