Source code for mlconjug3.PyVerbiste.PyVerbiste

"""
PyVerbiste.

This module contains the code for the class Verbiste.
More information about mlconjug3 at https://pypi.org/project/mlconjug3/


The conjugation data conforms to the XML schema defined by Verbiste.
More information on Verbiste at https://perso.b2b2c.ca/~sarrazip/dev/conjug_manager.html

"""

# Module authorship metadata.
__author__ = "Ars-Linguistica"
__author_email__ = "diao.sekou.nlp@gmail.com"


import os
import joblib
import copy
import defusedxml.ElementTree as ET
import json
from collections import OrderedDict
import pkg_resources
from mlconjug3.constants import *
from mlconjug3.verbs import *
from mlconjug3.conjug_manager import *


class Verbiste(ConjugManager):
    """
    This is the class handling the Verbiste xml files.

    :param language: string.
    |   The language of the conjugator. The default value is fr for French.
    |   The allowed values are: fr, en, es, it, pt, ro.
    :ivar language: Language of the conjugator.
    :ivar verbs: Dictionary where the keys are verbs and the values are conjugation patterns.
    :ivar conjugations: Dictionary where the keys are conjugation patterns and the values are inflected forms.
    :ivar _allowed_endings: set.
    |   A set containing the allowed endings of verbs in the target language.
    :ivar templates: list of strings. List of the conjugation patterns.

    """

    @staticmethod
    def _cache_path(file):
        """
        Build the path of the pickle cache associated with an XML file.

        Both the cache reader and the cache writers go through this helper so
        that they always agree on where the cache lives.

        :param file: string or path object.
            Path to the XML file.
        :return: string.
            Absolute path of the XML file with ``.pkl`` appended.

        """
        return os.path.abspath(os.fspath(file)) + ".pkl"

    def _load_cache(self, file):
        """
        Load the pickled parse result of an XML file if it is still fresh.

        The cache is considered fresh when the XML file was not modified
        after the cache file was written.

        :param file: string or path object.
            Path to the XML file.
        :return: object or None.
            The unpickled content of the cache, or None when there is no
            cache file or when the XML file is newer than the cache.
        :raises ValueError: if the absolute path does not end with ``.xml``.

        """
        file_path = os.path.abspath(os.fspath(file))
        if not file_path.endswith(".xml"):
            raise ValueError(f"Invalid file path, expected .xml file, got {file_path}")
        pkl_file = self._cache_path(file_path)
        if not os.path.isfile(pkl_file):
            return None
        last_modified_time_file = os.path.getmtime(file_path)
        last_modified_time_pkl = os.path.getmtime(pkl_file)
        if last_modified_time_file > last_modified_time_pkl:
            # The XML changed after the cache was written: the cache is stale.
            return None
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only use cache files from a trusted location.
        return joblib.load(pkl_file)

    def _load_verbs(self, verbs_file):
        """
        Load and parses the verbs from the xml file.

        Every occurrence of ``json`` in the given path is replaced by ``xml``
        before the file is parsed, and the result is stored in ``self.verbs``.

        :param verbs_file: string or path object.
            Path to the verbs xml file.

        """
        self.verbs = self._parse_verbs(os.fspath(verbs_file).replace("json", "xml"))

    def _parse_verbs(self, file):
        """
        Parses the verbs XML file.

        A non-empty fresh cache is returned as is. Otherwise the XML file is
        parsed and the result is written to the pickle cache next to it.

        :param file: string or path object.
            Path to the XML file containing the verbs.
        :return: dict.
            A dictionary mapping each verb to a dictionary with the keys
            ``template`` and ``root``.

        """
        file = os.fspath(file)
        cache = self._load_cache(file)
        if cache:
            return cache
        verbs_dic = {}
        xml = ET.parse(file)
        for verb in xml.findall("v"):
            verb_name = verb.find("i").text
            template = verb.find("t").text
            # The root is the verb stripped of as many trailing characters as
            # there are after the ":" of the template name.
            ending = template[template.index(":") + 1:]
            root = verb_name[: -len(ending)] if ending else verb_name
            verbs_dic[verb_name] = {"template": template, "root": root}
        joblib.dump(verbs_dic, self._cache_path(file), compress=("gzip", 3))
        return verbs_dic

    def _load_conjugations(self, conjugations_file):
        """
        Load and parses the conjugations from the xml file.

        Every occurrence of ``json`` in the given path is replaced by ``xml``
        before the file is parsed, and the result is stored in
        ``self.conjugations``.

        :param conjugations_file: string or path object.
            Path to the conjugation xml file.

        """
        self.conjugations = self._parse_conjugations(
            os.fspath(conjugations_file).replace("json", "xml")
        )

    def _parse_conjugations(self, file):
        """
        Parses the conjugation templates XML file.

        A non-empty fresh cache is returned as is. Otherwise the XML file is
        parsed and the result is written to the pickle cache next to it.

        :param file: string or path object.
            Path to the XML file containing the conjugation templates.
        :return: dict.
            A dictionary mapping each template name to an OrderedDict of
            moods, each mood being an OrderedDict mapping the tense name
            (with ``-`` replaced by a space) to its inflected forms.

        """
        file = os.fspath(file)
        cache = self._load_cache(file)
        if cache:
            return cache
        conjugations_dic = {}
        xml = ET.parse(file)
        for template in xml.findall("template"):
            moods = OrderedDict()
            for mood in template:
                moods[mood.tag] = OrderedDict(
                    (tense.tag.replace("-", " "), self._load_tense(tense))
                    for tense in mood
                )
            conjugations_dic[template.get("name")] = moods
        joblib.dump(conjugations_dic, self._cache_path(file), compress=("gzip", 3))
        return conjugations_dic

    @staticmethod
    def _load_tense(tense):
        """
        Load and parses the inflected forms of the tense from xml file.

        :param tense: xml element whose children hold the inflected forms.
            The tense currently being processed.
        :return: None, string or list.
            None when the tense has no child, or when its only child has no
            ``i`` element. The text of the ``i`` element when the tense has a
            single child. Otherwise a list of ``(index, form)`` tuples, one
            per child, where form is None when the child has no ``i`` element
            and an empty string when the ``i`` element has no text.

        """
        persons = list(tense)
        if not persons:
            return None
        if len(persons) == 1:
            form = persons[0].find("i")
            return None if form is None else form.text
        conjug = []
        for pers, term in enumerate(persons):
            form = term.find("i")
            if form is None:
                conjug.append((pers, None))
            elif form.text is None:
                conjug.append((pers, ""))
            else:
                conjug.append((pers, form.text))
        return conjug
if __name__ == "__main__":
    # Nothing is executed when this module is run directly as a script.
    pass