# Source code for mlconjug3.feature_extractor.feature_extractor

"""
This module declares the feature extractors for verbs.

A custom vectorizer optimized for extracting verb features,
including n-grams of verb endings and beginnings, verb length,
number of vowels and consonants, and ratio of vowels to consonants.
"""

import re
from mlconjug3.constants import ALPHABET


# Matches any run of two or more whitespace characters. Compiled once at
# import time rather than on every call.
_WHITE_SPACES = re.compile(r"\s\s+")


def extract_verb_features(verb, lang, ngram_range):
    """
    Extract the features used to predict the conjugation class of a verb.

    | Custom vectorizer optimized for extracting verb features.
    | As in Indo-European languages verbs are inflected by adding a
      morphological suffix, the vectorizer extracts verb endings and produces
      a representation of the verb made of binary (present/absent) features.
    | To enhance the results of the feature extraction, several other
      features are included: the verb's starting n-grams, its length, its
      number of vowels, its number of consonants and the ratio of vowels over
      consonants.

    Before extraction, every run of two or more whitespace characters is
    collapsed into a single space and the verb is lower-cased.

    The returned list is ordered as follows: all ``END=`` n-grams (shortest
    first), all ``START=`` n-grams (shortest first), then ``LEN=``,
    ``VOW_NUM=``, ``CONS_NUM=`` and ``V/C=``.

    :param verb: string.
        Verb to vectorize.
    :param lang: string.
        Language to analyze. When it is not a key of ``ALPHABET``, the
        ``'en'`` alphabet is used instead.
    :param ngram_range: tuple.
        ``(min_n, max_n)`` bounds, both inclusive, of the n-gram sliding
        window. Sizes larger than the verb's length are skipped.
    :return features: list.
        List of the most salient features of the verb for the task of
        finding its conjugation class.

    """
    verb = _WHITE_SPACES.sub(" ", verb).lower()
    verb_len = len(verb)
    min_n, max_n = ngram_range
    # An n-gram cannot be longer than the verb itself.
    sizes = range(min_n, min(max_n, verb_len) + 1)

    # ``verb[-0:]`` is the whole verb (``-0 == 0``), not an empty suffix, so a
    # zero-length ending is spelled out explicitly to mirror ``verb[:0]``.
    features = ["END={}".format(verb[-n:] if n else "") for n in sizes]
    features.extend("START={}".format(verb[:n]) for n in sizes)

    if lang not in ALPHABET:
        # We chose 'en' as the default alphabet because English is more
        # standard, without accents or diacritics.
        lang = "en"
    letters = ALPHABET[lang]
    vowels = sum(verb.count(char) for char in letters["vowels"])
    consonants = sum(verb.count(char) for char in letters["consonants"])

    if consonants == 0:
        # No consonant at all: the ratio is undefined, avoid dividing by zero.
        vow_cons_ratio = "V/C=N/A"
    else:
        vow_cons_ratio = "V/C={}".format(round(vowels / consonants, 2))

    features.extend(
        (
            "LEN={}".format(verb_len),
            "VOW_NUM={}".format(vowels),
            "CONS_NUM={}".format(consonants),
            vow_cons_ratio,
        )
    )
    return features
# Script entry point: currently a no-op when the file is run directly.
if __name__ == "__main__":
    pass