Sunday, January 5, 2025

Parse the text column from mtConcepts of an SDLTB file with Python XML or BeautifulSoup

from bs4 import BeautifulSoup

# Sample value of the mtConcepts "text" column: one concept group (<cG>)
# holding a German and a Romanian term. The adjacent literals below are
# concatenated by Python into a single string without any separators.
xml_data = (
    '<cG><c>1</c>'
    '<trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:50</dt></trG>'
    '<trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:58</dt></trG>'
    '<lG><l lang="DE" type="German"/>'
    '<tG><t>Pelletpresse</t>'
    '<trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:50</dt></trG>'
    '<trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:50</dt></trG>'
    '</tG></lG>'
    '<lG><l lang="RO" type="Romanian"/>'
    '<tG><t>presă de peleți</t>'
    '<trG><tr type="origination">letco</tr><dt>2020-08-30T19:12:58</dt></trG>'
    '<trG><tr type="modification">letco</tr><dt>2020-08-30T19:12:58</dt></trG>'
    '</tG></lG></cG>'
)

# Build the document tree with Beautiful Soup's XML parser
soup = BeautifulSoup(xml_data, features="xml")


# Function to extract translation details

def parse_translation(translation_element):
    """Extract language, term text and transactions from one ``<lG>`` tag.

    Args:
        translation_element: A BeautifulSoup tag for a language group
            (``<lG>``). It must contain an ``<l>`` tag carrying ``lang`` and
            ``type`` attributes and a ``<t>`` tag with the term text.

    Returns:
        A dict with the keys ``lang``, ``type``, ``text`` and
        ``transactions``. ``transactions`` is a list with one dict per
        ``<trG>`` found, each holding ``type``, ``actor`` and ``datetime``.
    """
    lang = translation_element.find("l").attrs
    text = translation_element.find("t").text

    # find_all() searches every descendant by default, so the <trG> groups
    # nested inside <tG> are collected here.
    transactions = [
        {
            "type": group.find("tr")["type"],
            "actor": group.find("tr").text,
            "datetime": group.find("dt").text,
        }
        for group in translation_element.find_all("trG")
    ]

    return {
        "lang": lang["lang"],
        "type": lang["type"],
        "text": text,
        "transactions": transactions,
    }


# Extract the main data

# `soup` is the document node and its only direct child is the root <cG>
# element. A non-recursive search for the concept-level <trG> groups must
# therefore start from <cG>; run on `soup` itself it matches nothing and the
# "transactions" list would always be empty.
concept_group = soup.find("cG")

data = {
    "c": soup.find("c").text,
    "transactions": [
        {
            "type": group.find("tr")["type"],
            "actor": group.find("tr").text,
            "datetime": group.find("dt").text,
        }
        # recursive=False keeps the term-level <trG> groups (nested under
        # <lG>/<tG>) out of the concept-level list.
        for group in concept_group.find_all("trG", recursive=False)
    ],
    "translations": [parse_translation(lG) for lG in soup.find_all("lG")],
}

# Output the extracted data
print(data)



from xml.etree import ElementTree as ET


# Build the element tree from the XML string; the result is its root element
root = ET.XML(xml_data)


# Function to extract translation details

def parse_translation(translation_element: ET.Element) -> dict:
    """Extract language, term text and transactions from one ``<lG>`` element.

    Args:
        translation_element: An ElementTree element for a language group
            (``<lG>``). It must have an ``<l>`` child carrying ``lang`` and
            ``type`` attributes and a ``<tG>`` child containing a ``<t>``.

    Returns:
        A dict with the keys ``lang``, ``type``, ``text`` and
        ``transactions``. ``text`` is the text of the first ``<tG>/<t>``
        match. ``transactions`` is a list with one dict per ``<tG>/<trG>``
        match, each holding ``type``, ``actor`` and ``datetime``.
    """
    lang = translation_element.find("./l").attrib
    text = translation_element.find("./tG/t").text

    transactions = [
        {
            "type": group.find("./tr").attrib["type"],
            "actor": group.find("./tr").text,
            "datetime": group.find("./dt").text,
        }
        for group in translation_element.findall("./tG/trG")
    ]

    return {
        "lang": lang["lang"],
        "type": lang["type"],
        "text": text,
        "transactions": transactions,
    }


# Collect the concept id, the concept-level transactions (direct <trG>
# children of the root) and one parsed entry per direct <lG> child
data = dict(
    c=root.find("c").text,
    transactions=[
        {
            "type": group.find("tr").attrib["type"],
            "actor": group.find("tr").text,
            "datetime": group.find("dt").text,
        }
        for group in root.iterfind("trG")
    ],
    translations=list(map(parse_translation, root.iterfind("lG"))),
)

# Show the resulting structure
print(data)