Compiled blog: Parse text column from mtConcepts of an SDLTB file with Python XML or BeautifulSoup

Sunday, January 5, 2025

Parse text column from mtConcepts of an SDLTB file with Python XML or BeautifulSoup

from bs4 import BeautifulSoup

# Sample XML: one concept group (<cG>) holding a concept id (<c>), concept-level transaction groups (<trG>), and one language group (<lG>) each for German ("DE") and Romanian ("RO")

xml_data = """<cG><c>1</c><trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:50</dt></trG><trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:58</dt></trG><lG><l lang="DE" type="German"/><tG><t>Pelletpresse</t><trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:50</dt></trG><trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:50</dt></trG></tG></lG><lG><l lang="RO" type="Romanian"/><tG><t>presă de peleți</t><trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:58</dt></trG><trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:58</dt></trG></tG></lG></cG>"""

# Parse the XML string with BeautifulSoup, selecting its "xml" parser feature

soup = BeautifulSoup(xml_data, "xml")

# Function to extract translation details

def parse_translation(translation_element):
    """Extract one language entry from an ``<lG>`` element (BeautifulSoup).

    Args:
        translation_element: BeautifulSoup tag for a language group. It must
            contain an ``<l>`` tag carrying ``lang`` and ``type`` attributes
            and a ``<t>`` tag with the term text; any ``<trG>`` descendants
            are read as transactions.

    Returns:
        A dict with the keys ``"lang"``, ``"type"``, ``"text"`` and
        ``"transactions"``; the latter is a list of dicts with the keys
        ``"type"``, ``"actor"`` and ``"datetime"``, one per ``<trG>``.

    Raises:
        AttributeError: if an expected ``<l>``, ``<t>``, ``<tr>`` or ``<dt>``
            tag is missing (``find`` returns ``None``).
        KeyError: if a ``lang`` or ``type`` attribute is missing.
    """
    lang = translation_element.find("l").attrs
    text = translation_element.find("t").text

    # find_all searches all descendants, so this picks up the <trG> groups
    # nested inside <tG>.
    transactions = [
        {
            "type": tr.find("tr")["type"],
            "actor": tr.find("tr").text,
            "datetime": tr.find("dt").text,
        }
        for tr in translation_element.find_all("trG")
    ]

    return {
        "lang": lang["lang"],
        "type": lang["type"],
        "text": text,
        "transactions": transactions,
    }

# Extract the main data

data = {
    # Concept id: text of the first <c> element in the document.
    "c": soup.find("c").text,
    "transactions": [
        {
            "type": tr.find("tr")["type"],
            "actor": tr.find("tr").text,
            "datetime": tr.find("dt").text,
        }
        # recursive=False limits the search to direct children, so it has to
        # start from <cG>: the document object's only direct child is <cG>
        # itself, and searching from `soup` would always yield an empty list.
        for tr in soup.find("cG").find_all("trG", recursive=False)
    ],
    # One entry per language group (<lG>).
    "translations": [parse_translation(lG) for lG in soup.find_all("lG")],
}

# Output the extracted data
print(data)

from xml.etree import ElementTree as ET

# Parse the XML string with ElementTree; fromstring returns the document's root element (here <cG>)

root = ET.fromstring(xml_data)

# Function to extract translation details

def parse_translation(translation_element):
    """Extract one language entry from an ``<lG>`` element (ElementTree).

    Args:
        translation_element: ``xml.etree.ElementTree.Element`` for a language
            group. It must have a direct ``<l>`` child carrying ``lang`` and
            ``type`` attributes and a ``<tG>`` child containing ``<t>``;
            ``<trG>`` children of ``<tG>`` are read as transactions.

    Returns:
        A dict with the keys ``"lang"``, ``"type"``, ``"text"`` and
        ``"transactions"``; the latter is a list of dicts with the keys
        ``"type"``, ``"actor"`` and ``"datetime"``, one per ``<trG>``
        (empty when there are none).

    Raises:
        AttributeError: if an expected ``<l>``, ``<t>``, ``<tr>`` or ``<dt>``
            element is missing (``find`` returns ``None``).
        KeyError: if a ``lang`` or ``type`` attribute is missing.
    """
    lang = translation_element.find("./l").attrib
    text = translation_element.find("./tG/t").text

    # Only the transaction groups directly under <tG> belong to this term.
    transactions = [
        {
            "type": tr.find("./tr").attrib["type"],
            "actor": tr.find("./tr").text,
            "datetime": tr.find("./dt").text,
        }
        for tr in translation_element.findall("./tG/trG")
    ]

    return {
        "lang": lang["lang"],
        "type": lang["type"],
        "text": text,
        "transactions": transactions,
    }

# Extract the main data

data = {
    # Concept id: text of the <c> element directly under the root.
    "c": root.find("./c").text,
    # Concept-level transactions: only <trG> elements that are direct
    # children of the root, not those nested inside language groups.
    "transactions": [
        {
            "type": tr.find("./tr").attrib["type"],
            "actor": tr.find("./tr").text,
            "datetime": tr.find("./dt").text,
        }
        for tr in root.findall("./trG")
    ],
    # One entry per language group (<lG>) directly under the root.
    "translations": [parse_translation(lG) for lG in root.findall("./lG")],
}

# Output the extracted data
print(data)

Compiled blog

Pages

Sunday, January 5, 2025

Parse text column from mtConcepts of an SDLTB file with Python XML or BeautifulSoup

Show IP and Country

Search This Blog

LinkedIn Profile

About Me

Useful Links

Blog Archive

Tags

2Performant

ProZ.com Jobs

TranslatorsCafe.com: Recent Translation Jobs

TranslatorsTown.com

Total Pageviews

Popular Posts

SmartCAT

Wikipedia

Google Translate

2performant

Compiled blog

Pages

Sunday, January 5, 2025

Parse text column from mtConcepts of an SDLTB file with Python XML or BeautifulSoup

Show IP and Country

Search This Blog

LinkedIn Profile

About Me

Useful Links

Blog Archive

Tags

2Performant

ProZ.com Jobs

TranslatorsCafe.com: Recent Translation Jobs

TranslatorsTown.com

Total Pageviews

Popular Posts

Subscribe To

SmartCAT

Wikipedia

Google Translate

2performant