:- module(snowball_label_match,
          [ snowball_label_match/3
          ]).

:- use_module(library(debug)).
:- use_module(library(option)).
:- use_module(library(snowball)).
:- use_module(library(semweb/rdf11)).
:- use_module(library(amalgame/lit_distance)).
:- use_module(library(amalgame/vocabulary)).
:- use_module(string_match_util).

%!  snowball_label_match(?In, ?Out, +Options) is nondet.
%
%   Relates In = align(Source, Target, Prov0) to
%   Out = align(Source, Target, [Prov|Prov0]) whenever a label of
%   Source and a label of Target have matching Snowball stems.
%   Both labels are lower-cased with downcase_atom/2 before being
%   stemmed.  On backtracking, further source labels, target
%   candidates and target labels are tried.
%
%   Options processed (defaults are the ones coded below):
%
%     * snowball_language(+Algorithm)
%       Stemmer handed to snowball/3.  Default `english`.
%     * prefix(+Length)
%       Number of leading characters of the source label used to
%       select target candidates.  Default 4.
%     * sourcelabel(+Property)
%     * targetlabel(+Property)
%       Label properties handed to skos_has/5.  Default
%       amalgame:label.
%     * matchacross_lang(+Boolean)
%       If `false`, the target label language is unified with the
%       source label language.  Default `true`.
%     * source_language(+Lang)
%       Language of the source label; `any` (the default) leaves it
%       unconstrained.
%     * edit_distance(+Max)
%       With 0 (the default) the two stems must be identical (==/2).
%       Otherwise the distance computed by literal_distance/3 must
%       not exceed Max.
%     * target_scheme(+Scheme)
%       Optional.  If given, Target must satisfy vocab_member/2 for
%       Scheme.
%
%   Prov is a list holding method(snowball), prefix/1, source_stem/1,
%   target_stem/1, match/1 (computed as 1/(1+Distance)) and graph/1
%   with the two rdf/3 label triples that were matched.
%
%   skos_has/5, vocab_member/2 and literal_distance/3 are not defined
%   in this file; they are presumably exported by the modules loaded
%   above -- confirm there for their exact contracts.

snowball_label_match(align(Source, Target, Prov0),
                     align(Source, Target, [Prov|Prov0]),
                     Options) :-
    rdf_equal(amalgame:label, DefaultLabelProp),
    option(snowball_language(Stemmer), Options, english),
    option(prefix(PrefixLen), Options, 4),
    option(sourcelabel(SourceLabelProp), Options, DefaultLabelProp),
    option(targetlabel(TargetLabelProp), Options, DefaultLabelProp),
    option(matchacross_lang(AcrossLangs), Options, true),
    option(source_language(RequestedLang), Options, any),
    option(edit_distance(MaxDistance), Options, 0),

    slm_source_language(RequestedLang, SourceLang),
    skos_has(Source, SourceLabelProp, SourceLabel@SourceLang,
             SourceHitProp, Options),
    % NOTE(review): this only rules out the empty atom; whether
    % skos_has/5 can also deliver an empty string ("") is not visible
    % from this file -- confirm.
    SourceLabel \= '',

    % If we can't match across languages, set the target language to
    % the source language; otherwise leave it unconstrained.
    (   AcrossLangs == false
    ->  TargetLang = SourceLang
    ;   true
    ),

    slm_label_prefix(SourceLabel, PrefixLen, Prefix),
    slm_stem(Stemmer, SourceLabel, SourceStem),

    % Target candidate generation based on prefixes...
    % FIXME: this should be replaced by a hash lookup on a preprocessed
    % stem table.  The current implementation can miss stemmed matches
    % because the prefixes of the unstemmed labels do not match.

    % Backtrack over all target candidates with a prefix match:
    rdf11:{ prefix(CandidateLabel, Prefix) },
    skos_has(Target, TargetLabelProp, CandidateLabel@TargetLang,
             _CandidateProp, Options),

    % Backtrack over all labels of the current target candidate:
    skos_has(Target, TargetLabelProp, TargetLabel@TargetLang,
             TargetHitProp, Options),

    slm_in_target_scheme(Target, Options),
    slm_stem(Stemmer, TargetLabel, TargetStem),
    slm_stem_distance(MaxDistance, SourceStem, TargetStem, Distance),

    Score is 1 / (1 + Distance),
    Prov = [ method(snowball),
             prefix(Prefix),
             source_stem(SourceStem),
             target_stem(TargetStem),
             match(Score),
             graph([ rdf(Source, SourceHitProp, SourceLabel@SourceLang),
                     rdf(Target, TargetHitProp, TargetLabel@TargetLang)
                   ])
           ],
    debug(align_result, 'snowball match: ~p ~p', [Source,Target]).

%   slm_source_language(+Requested, -Language)
%
%   Language is left unbound when Requested is `any`; otherwise it is
%   unified with Requested.

slm_source_language(Requested, Language) :-
    (   Requested == 'any'
    ->  true
    ;   Language = Requested
    ).

%   slm_label_prefix(+Label, +Length, -Prefix)
%
%   Prefix is the first Length characters of Label, or Label itself
%   when sub_atom/5 cannot deliver a prefix of that length.

slm_label_prefix(Label, Length, Prefix) :-
    (   sub_atom(Label, 0, Length, _, Prefix)
    ->  true
    ;   Prefix = Label
    ).

%   slm_stem(+Algorithm, +Label, -Stem)
%
%   Stem is the result of snowball/3 for Algorithm applied to the
%   lower-cased Label.

slm_stem(Algorithm, Label, Stem) :-
    downcase_atom(Label, Lowered),
    snowball(Algorithm, Lowered, Stem).

%   slm_in_target_scheme(+Target, +Options)
%
%   Succeeds without further checks if Options holds no
%   target_scheme/1; otherwise Target must be a vocab_member/2 of the
%   given scheme.

slm_in_target_scheme(Target, Options) :-
    (   option(target_scheme(Scheme), Options)
    ->  vocab_member(Target, Scheme)
    ;   true
    ).

%   slm_stem_distance(+MaxDistance, +SourceStem, +TargetStem, -Distance)
%
%   With MaxDistance == 0 the stems must be identical and Distance
%   is 0.  Otherwise Distance comes from literal_distance/3 and must
%   be at most MaxDistance.

slm_stem_distance(MaxDistance, SourceStem, TargetStem, Distance) :-
    (   MaxDistance == 0
    ->  TargetStem == SourceStem,
        Distance = 0
    ;   literal_distance(SourceStem, TargetStem, Distance),
        Distance =< MaxDistance
    ).