# Source code for mlconjug3.feature_extractor.feature_extractor

"""
This module declares the feature extractors for verbs.

A custom vectorizer optimized for extracting verb features,
including n-grams of verb endings and beginnings, verb length,
number of vowels and consonants, and ratio of vowels to consonants.
"""

import re
from mlconjug3.constants import ALPHABET


def extract_verb_features(verb, lang, ngram_range):
    """
    Extract the features of a verb used to find its conjugation class.

    | Custom Vectorizer optimized for extracting verbs features.
    | As in Indo-European languages verbs are inflected by adding a morphological suffix,
     the vectorizer extracts verb endings and produces a vector representation of the verb with binary features.
    | To enhance the results of the feature extraction, several other features have been included:
    | The features are the verb's ending n-grams, starting n-grams, length of the verb, number of vowels,
     number of consonants and the ratio of vowels over consonants.
    | Before extraction, runs of two or more whitespace characters are collapsed
     into a single space and the verb is lower-cased.

    :param verb: string.
        Verb to vectorize.
    :param lang: string.
        Language to analyze. When it is not a key of ``ALPHABET``, the 'en' alphabet is used.
    :param ngram_range: tuple.
        The range ``(min_n, max_n)`` of the ngram sliding window, both bounds inclusive.
        Sizes below 1 are ignored and sizes above the verb length are skipped.
    :return features: list.
        List of the most salient features of the verb for the task of finding its conjugation class,
        in this order: ``END=`` n-grams by increasing size, ``START=`` n-grams by increasing size,
        ``LEN=``, ``VOW_NUM=``, ``CONS_NUM=`` and ``V/C=``.
    """
    # Collapse runs of two or more whitespace characters into a single space.
    verb = re.sub(r"\s\s+", " ", verb).lower()
    verb_len = len(verb)
    min_n, max_n = ngram_range
    # Sizes start at 1: for n == 0, ``verb[-0:]`` is the whole verb rather than
    # an empty suffix, and negative sizes would slice from the wrong end.
    # Sizes are also capped at the verb length so no n-gram is emitted twice.
    sizes = range(max(min_n, 1), min(max_n, verb_len) + 1)
    features = ["END={}".format(verb[-n:]) for n in sizes]
    features.extend("START={}".format(verb[:n]) for n in sizes)
    features.append("LEN={}".format(verb_len))
    # We chose 'en' as the default alphabet because English is more standard,
    # without accents or diacritics.
    alphabet = ALPHABET[lang if lang in ALPHABET else "en"]
    vowels = sum(verb.count(c) for c in alphabet["vowels"])
    consonants = sum(verb.count(c) for c in alphabet["consonants"])
    features.append("VOW_NUM={}".format(vowels))
    features.append("CONS_NUM={}".format(consonants))
    if consonants:
        features.append("V/C={}".format(round(vowels / consonants, 2)))
    else:
        # No consonant counted: the ratio is undefined.
        features.append("V/C=N/A")
    return features


# Nothing happens when this module is executed directly: the guarded body is a no-op.
if __name__ == "__main__":
    pass