(: Merging two lexicon graphs :)
(: XQUERY for processing LG data :)
(: Thorsten Trippel August 2004 :)
declare namespace my="my-functions.uri";
declare variable $largelexinfile as xs:string external;
declare variable $smalllexinfile as xs:string external;
(: Function disambiguate-id: returns an id that does not occur as a
   lexid attribute on any of the given nodes. :)
(: $ambiguousid   - the id that may already be in use
   $largelexitems - the nodes whose lexid attributes count as "in use"
   The letter "a" is appended to the candidate as often as needed;
   a candidate that is not in use is returned unchanged. :)
declare function my:disambiguate-id(
  $ambiguousid as xs:string,
  $largelexitems as node()*
) as xs:string
{
  let $idtaken :=
    some $usedid in $largelexitems/@lexid
    satisfies $usedid = $ambiguousid
  return
    if (not($idtaken)) then
      $ambiguousid
    else
      (: still clashing: try again with one more "a" appended :)
      my:disambiguate-id(concat($ambiguousid, "a"), $largelexitems)
};
(: Transformation of the lexicon:
Takes a large LG lexicon and a small LG lexicon. :)
(: Every element of the small one is tested against
the large one and, if it is not already in there, is added. :)
(: Before the addition the ids are adjusted.
The references from the relations are adjusted as well. :)
(: Every lexitem of the large LG lexicon is copied; a lexitem of the
small lexicon that is already part of the large one is not added again. :)
(: The roles of the lexicons can be interchanged, possibly with
consequences for performance. :)
(: Definition of the lexicon files :)
(: Load both lexicon documents from the externally supplied file names. :)
let $largelex := doc($largelexinfile), (: e.g. LG_complete.xml :)
    $smalllex := doc($smalllexinfile)  (: e.g. LG_to_be_merged.xml :)
(: These are the lexitems of the large lexicon. :)
let $largelexitems := $largelex/LG/lexitems/lexitem
(: These are the relations of the large lexicon; the variable is used
   when the relations of both lexicons are merged. :)
let $largelexrelations := $largelex/LG/relations/relation
(: This is the processing of the lexical items. :)
(: Every lexical item of the small lexicon is classified and wrapped:
   <redundant>    - the large lexicon has an item with the same text and
                    the same type; the copy takes over the lexid of that
                    large item (one copy per matching large item).
   <nonredundant> - no such item exists; the copy gets a lexid that does
                    not occur in the large lexicon.
   In both cases the original lexid is kept in @orglexid so that the
   relations of the small lexicon can be re-targeted afterwards. :)
(: NOTE(review): new ids are only checked against the large lexicon, not
   against the other items of the small lexicon - confirm that two small
   items can never end up with the same adjusted id. :)
let $potentiallexitemsmall :=
<all_lexitems_smalllex>{
  for $potentiallexitem in $smalllex/LG/lexitems/lexitem
  (: A missing type attribute counts as the empty string on both sides,
     so that the test for redundancy and the selection of the matching
     large items can never disagree. :)
  let $smalltype := string($potentiallexitem/@type)
  let $samelargeitems :=
    $largelexitems[text() = $potentiallexitem/text()]
                  [string(@type) = $smalltype]
  return
    if (exists($samelargeitems)) then
      for $largelexitems_test in $samelargeitems
      return
        <redundant>{
          <lexitem
            type="{$potentiallexitem/@type}"
            lexid="{$largelexitems_test/@lexid}"
            orglexid="{$potentiallexitem/@lexid}">{
            $potentiallexitem/text()
          }</lexitem>
        }</redundant>
    else
      <nonredundant>{
        <lexitem
          type="{$potentiallexitem/@type}"
          lexid="{my:disambiguate-id($potentiallexitem/@lexid,
                                     $largelexitems)}"
          orglexid="{$potentiallexitem/@lexid}">{
          $potentiallexitem/text()
        }</lexitem>
      }</nonredundant>
}</all_lexitems_smalllex>
(: Here the lexical items of the merged lexicon are collected. :)
let $alllexitems :=
<lexitems>{
  (: the items of the small lexicon that were wrapped as nonredundant;
     they already carry their adjusted lexid :)
  $potentiallexitemsmall/nonredundant/lexitem,
  (: followed by every item of the large lexicon, copied as it is :)
  $largelexitems
}</lexitems>
(: The relations of the small lexicon still refer to the original ids,
   so their idrefs are rewritten to the adjusted lexids. :)
(: One relation is produced per target of a small-lexicon relation; it
   carries the type and all sources of the relation the target came from.
   An idref that matches the @orglexid of a processed item is replaced by
   that item's @lexid and the old value is kept in @orgidref; an idref
   without such a match is kept and marked with orgidref="unique". :)
let $modifiedrelations :=
<relations>{
  let $processeditems := $potentiallexitemsmall//lexitem
  for $target in $smalllex/LG/relations/relation/target
  let $relation := $target/..
  let $targethits := $processeditems[@orglexid = $target/@idref]
  return
    <relation type="{$relation/@type}">
    {
      if (empty($targethits)) then
        <target idref="{$target/@idref}" orgidref="unique"/>
      else
        for $hit in $targethits
        return
          <target idref="{$hit/@lexid}" orgidref="{$target/@idref}"/>
    }
    {
      for $source in $relation/source
      let $sourcehits := $processeditems[@orglexid = $source/@idref]
      return
        if (empty($sourcehits)) then
          <source idref="{$source/@idref}" orgidref="unique"/>
        else
          for $hit in $sourcehits
          return
            <source idref="{$hit/@lexid}" orgidref="{$source/@idref}"/>
    }
    </relation>
}</relations>
(: With the adjusted relations the relations of both lexicons can
   now be merged. :)
(: Two relations count as the same relation if they have the same type
   and the same target idref. Type and idref are compared separately
   (a missing attribute counts as the empty string), so that two
   different pairs can never be mistaken for each other. :)
let $allrelations :=
<relations>{
  let $smallrelations := $modifiedrelations/relation
  return (
    (: First the intersection of both lexicons: the relation of the
       large lexicon is written once, with the sources of the large
       relation and of the matching small relations merged. :)
    for $largelexrelation in $largelexrelations
    let $largetarget := string($largelexrelation/target/@idref)
    let $largetype := string($largelexrelation/@type)
    let $matchingsmall :=
      $smallrelations[string(@type) = $largetype]
                     [target[string(@idref) = $largetarget]]
    where exists($matchingsmall)
    return
      <relation type="{$largelexrelation/@type}">
        <target idref="{$largelexrelation/target/@idref}"/>
        {
          (: The idrefs are merged as plain values; copying two idref
             attributes into one helper element would be an error.
             Only the matching small relations contribute, and the
             sources of one side survive even if the other has none. :)
          for $sourceid in distinct-values(
            ($largelexrelation/source/@idref,
             $matchingsmall/source/@idref))
          return <source idref="{$sourceid}"/>
        }
      </relation>
    ,
    (: Now the ones only present in the large lexicon; they are
       copied unchanged. :)
    for $largelexrelation in $largelexrelations
    let $largetarget := string($largelexrelation/target/@idref)
    let $largetype := string($largelexrelation/@type)
    where empty(
      $smallrelations[string(@type) = $largetype]
                     [target[string(@idref) = $largetarget]])
    return $largelexrelation
    ,
    (: Finally the ones only in the small lexicon: for every target
       without a counterpart in the large lexicon the adjusted relation
       it belongs to is copied. :)
    for $smalllexrelation in $smallrelations/target
    let $smalltarget := string($smalllexrelation/@idref)
    let $smalltype := string($smalllexrelation/../@type)
    where empty(
      $largelexrelations[string(@type) = $smalltype]
                        [string(target/@idref) = $smalltarget])
    return $smalllexrelation/..
  )
}</relations>
(: Right here comes the output: the merged lexicon graph. :)
return
<LG>
<lexitems>{
  (: every collected item is written with its lexid, its type and its
     text only; helper attributes such as orglexid are not copied :)
  for $item in $alllexitems/lexitem
  return
    element lexitem {
      attribute lexid { $item/@lexid },
      attribute type { $item/@type },
      $item/text()
    }
}</lexitems>
{ (: the merged relations :) $allrelations }
{
  (: Process everything else: the knowledge elements are copied as
     they are. :)
  (: NOTE(review): only the knowledge of the small lexicon is copied,
     the large lexicon is not consulted here - confirm this is
     intended. :)
  $smalllex/LG/knowledge
}
</LG>