Monday, October 2, 2023

Python Convert TSV to TMX

Given the TSV (tab-delimited) file data.csv with this content:

English\tRomanian
Hello my name is John.\tSalut, numele meu e John.
Today is Monday.\tAzi e luni.

import pandas as pd
import datetime

def tsv_t_tmx(myfile='data.csv', source_lang='en', target_lang='ro', separator='\t'):
  """Convert a two-column delimited file into a TMX 1.4 translation memory.

  The input is read with pandas and must have a header row containing the
  columns ``English`` (source text) and ``Romanian`` (target text). One
  ``<tu>`` element is emitted per data row. The document is written, UTF-8
  encoded, to both ``demofile2.xml`` and ``demofile2.tmx`` in the current
  working directory.

  Args:
    myfile: Path of the delimited input file.
    source_lang: Language code written to ``srclang`` and to the source
      ``<tuv xml:lang=...>``.
    target_lang: Language code written to the target ``<tuv xml:lang=...>``.
    separator: Column delimiter passed to ``pandas.read_csv``.

  Returns:
    None. The result is the two files written to disk.

  Raises:
    KeyError: If the input has no ``English`` or ``Romanian`` column.
  """
  # Local import keeps the block self-contained (no change to file imports).
  from xml.sax.saxutils import escape

  # NOTE(review): the TMX specification appears to recommend the
  # YYYYMMDDThhmmssZ pattern for creationdate; the existing format is kept
  # so the output stays unchanged. Confirm before altering.
  d2 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

  # dtype=str / keep_default_na=False keep every cell exactly as written:
  # without them "1.50" becomes 1.5 and "NA" or empty cells become "nan".
  df = pd.read_csv(myfile, sep=separator, dtype=str, keep_default_na=False)

  # Language codes land inside double-quoted attributes, so quotes must be
  # escaped in addition to &, < and >.
  attr_entities = {'"': '&quot;'}
  src = escape(str(source_lang), attr_entities)
  tgt = escape(str(target_lang), attr_entities)

  parts = ['''<?xml version="1.0" encoding="UTF-8" ?>
             <tmx version="1.4">''']
  parts.append(f'''<header creationdate="{d2}"
             srclang= "{src}"
             adminlang="en"
             o-tmf="unknown"
             segtype="sentence"
             creationtool="Python"
             creationtoolversion="unknown"
             datatype="PlainText" />
      <body>\n''')

  for source_text, target_text in zip(df['English'], df['Romanian']):
    # Escape &, < and > so arbitrary sentence text cannot break the XML.
    eng = escape(source_text)
    rom = escape(target_text)
    parts.append(f'''
        <tu>
         <tuv xml:lang="{src}">
            <seg>{eng}</seg>
         </tuv>
        <tuv xml:lang="{tgt}">
           <seg>{rom}</seg>
         </tuv>
        </tu>''')

  parts.append('''
      </body>
            </tmx>''')

  document = ''.join(parts)

  # Both historical outputs are still produced. Each is written explicitly
  # as UTF-8 so the bytes always match the encoding declared in the prolog,
  # regardless of the platform's default encoding.
  for out_path in ("demofile2.xml", "demofile2.tmx"):
    with open(out_path, "w", encoding="UTF-8") as out:
      out.write(document)

# Script entry point: when run directly (not imported), convert the default
# input file using the function's default languages and separator.
if __name__ == '__main__':
  tsv_t_tmx()

*****

TMX model:

<tmx version="1.4"><header creationtool="" creationtoolversion="" segtype="phrase" o-tmf="" adminlang="en" srclang="en" datatype="PlainText" o-encoding="UTF-8" /><body><tu><tuv xml:lang="en"><seg /></tuv></tu><tu><tuv xml:lang="en"><seg /></tuv></tu><tu><tuv xml:lang="en"><seg /></tuv></tu></body></tmx>

*****

<?xml version="1.0" encoding="UTF-8" ?>
             <tmx version="1.4"><header creationdate="2023-03-16 20:30:30"
             srclang= "en"
             adminlang="en"
             o-tmf="unknown"
             segtype="sentence"
             creationtool="Python"
             creationtoolversion="unknown"
             datatype="PlainText" />
      <body>

        <tu>
         <tuv xml:lang="en">
            <seg>Hello my name is John.</seg>
         </tuv>
        <tuv xml:lang="ro">
           <seg>Salut, numele meu e John.</seg>
         </tuv>
        </tu>
        <tu>
         <tuv xml:lang="en">
            <seg>Today is Monday.</seg>
         </tuv>
        <tuv xml:lang="ro">
           <seg>Azi e luni.</seg>
         </tuv>
        </tu>
      </body>
            </tmx>