(: Merging two lexicon graphs :)

(: XQUERY for processing LG data :)
(: Thorsten Trippel August 2004 :)

declare namespace my="my-functions.uri";
declare variable $largelexinfile as xs:string external;
declare variable $smalllexinfile as xs:string external;

(: my:disambiguate-id
   Returns an identifier, derived from $ambiguousid, that does not occur
   as a lexid attribute on any of the nodes in $largelexitems.
   As long as the candidate is already taken, one more letter "a" is
   appended and the check is repeated; a candidate that is not taken is
   returned unchanged. :)

declare function my:disambiguate-id 
($ambiguousid as xs:string, $largelexitems as node()* ) 
as xs:string
  {
  let $taken := exists($largelexitems[@lexid = $ambiguousid])
  return
    if (not($taken)) then 
      $ambiguousid
    else 
      my:disambiguate-id(concat($ambiguousid, "a"), $largelexitems)
  };

(: Transformation of the lexicon: 
   takes a large LG lexicon and a small LG lexicon. :)
(: Every lexitem of the small lexicon is tested against 
   the large one and is added if it is not already in there. :)
(: Before the addition the ids are adjusted. 
   The references from the relations are adjusted as well. :)
(: Every lexitem of the large LG lexicon is copied if it is 
   not part of the small lexicon. :)
(: The roles of the two lexicons can be interchanged, possibly 
   with consequences for performance. :)


(: Input documents: both lexicons are loaded from the locations passed
   in through the external variables $largelexinfile and
   $smalllexinfile. :)
let $largelex := doc($largelexinfile), (: e.g. LG_complete.xml :)
	$smalllex := doc($smalllexinfile) (: e.g. LG_to_be_merged.xml :)

(: These are the lexitems of the large lexicon :)

let $largelexitems := $largelex/LG/lexitems/lexitem

(: This variable is used in processing the relations.
   A list of target-idref/type keys used to be precomputed here as well,
   but nothing read it: the step that compares such keys builds its own
   list where it needs it. The unused binding was therefore removed. :)
let $largelexrelations := $largelex/LG/relations/relation

(: Classification of the lexical items of the small lexicon.
   Every lexitem of the small lexicon is wrapped either in
     redundant     when the large lexicon holds a lexitem with the same
                   text and the same type. The new lexid is the lexid of
                   that large-lexicon item; one wrapper is produced per
                   matching large-lexicon item.
     nonredundant  otherwise. The new lexid is the original lexid, made
                   unique against the large lexicon by
                   my:disambiguate-id.
   In both cases the original identifier is kept in the orglexid
   attribute so that references to it can be found later. :)

let $potentiallexitemsmall := 
      <all_lexitems_smalllex>{
      for $potentiallexitem in $smalllex/LG/lexitems/lexitem
      let $itemtext := $potentiallexitem/text()
      (: A missing type attribute counts as the empty string. Comparing
         the attributes directly would be false whenever one of them is
         absent, so an untyped item whose text occurs in the large
         lexicon would be reported neither as redundant nor as
         nonredundant. :)
      let $itemtype := string($potentiallexitem/@type)
      (: One pass over the large lexicon: same text and same type. :)
      let $duplicates := 
          $largelexitems[text() = $itemtext][string(@type) = $itemtype]
      (: NOTE(review): my:disambiguate-id only checks the ids of the
         large lexicon. Two small-lexicon items could still receive the
         same new lexid; confirm that the input ids rule this out. :)
      return 
        if (exists($duplicates)) then
          for $duplicate in $duplicates
          return 
            <redundant><lexitem 
                type="{$potentiallexitem/@type}" 
                lexid="{$duplicate/@lexid}" 
                orglexid="{$potentiallexitem/@lexid}"
              >{$itemtext}</lexitem></redundant>
        else
          <nonredundant><lexitem 
              type="{$potentiallexitem/@type}" 
              lexid="{my:disambiguate-id($potentiallexitem/@lexid,
                        $largelexitems)}" 
              orglexid="{$potentiallexitem/@lexid}"
            >{$itemtext}</lexitem></nonredundant>
      }</all_lexitems_smalllex>

(: The merged list of lexical items: first the small-lexicon items that
   were classified as nonredundant, then every lexical item of the
   large lexicon. Items classified as redundant are not added, because
   an item with the same text and type is already part of the large
   lexicon. :)

let $alllexitems := 
  <lexitems>{
    $potentiallexitemsmall/nonredundant/lexitem,
    $largelexitems
  }</lexitems>

(: The relations of the small lexicon with their references rewritten
   to the new identifiers.
   One relation element is produced per target element of the small
   lexicon; it carries the type of the relation that target belongs to.
   A target or source whose idref equals the orglexid of a classified
   small-lexicon item gets the lexid of that item as its idref and keeps
   the old value in orgidref. A reference without such an item keeps its
   idref and is marked with orgidref="unique". :)
let $modifiedrelations :=
  <relations>{
    for $target in $smalllex/LG/relations/relation/target
    let $relation := $target/..
    let $classified := $potentiallexitemsmall//lexitem
    let $targetitems := $classified[@orglexid = $target/@idref]
    return 
      <relation type="{$relation/@type}">{
        (
        if (empty($targetitems)) then
          <target idref="{$target/@idref}" 
		     orgidref="unique"/>
        else
          for $item in $targetitems
          return <target idref="{$item/@lexid}" 
		       orgidref="{$target/@idref}"/>
        ),
        (
        for $source in $relation/source
        let $sourceitems := $classified[@orglexid = $source/@idref]
        return 
          if (empty($sourceitems)) then
            <source idref="{$source/@idref}" 
		       orgidref="unique"/>
          else
            for $item in $sourceitems
            return <source idref="{$item/@lexid}" 
		     orgidref="{$source/@idref}"/>
        )
      }</relation>
  }</relations>

(: Merging of the relations of both lexicons.
   A relation is identified by a key: the idref of its target followed
   by its type. The result consists of three groups:
     1. relations whose key occurs in both lexicons; they are rebuilt
        with the target of the large lexicon and the union of the
        sources of both sides,
     2. relations of the large lexicon whose key does not occur in the
        small lexicon, copied unchanged,
     3. relations of the small lexicon whose key does not occur in the
        large lexicon, copied from $modifiedrelations and therefore
        still carrying their orgidref attributes. :)

let $allrelations := 
    (
    (: The key lists do not depend on the relation being processed,
       so they are computed once. :)
    let $smalltargets := $modifiedrelations/relation/target
    let $smallkeys := 
        for $smalltarget in $smalltargets
        return concat($smalltarget/@idref, $smalltarget/../@type)
    let $largekeys := 
        for $largerelation in $largelexrelations
        return concat($largerelation/target/@idref, 
                      $largerelation/@type)
    return 
    <relations>
      {
	(: First the intersection of both lexicons, seen from the
	   target perspective :)
	for $largelexrelation in $largelexrelations
	let $largekey := 
	    concat($largelexrelation/target/@idref,
                   $largelexrelation/@type)
	(: Only the small-lexicon relations with the same key contribute
	   sources to this relation. :)
	let $smallmatches := 
	    $smalltargets[concat(@idref, ../@type) = $largekey]/..
	where exists($smallmatches)
	return 
          <relation type="{$largelexrelation/@type}">
	     <target idref="{$largelexrelation/target/@idref}"/>
               {
	       (: Union of the source references of both sides without
	          duplicates. The idref values are collected directly:
	          copying two idref attribute nodes into one element is
	          a duplicate-attribute error, and pairing the sources of
	          both sides in one loop yields nothing as soon as one
	          side has no sources. :)
	       for $idref in distinct-values(
	             ($largelexrelation/source/@idref,
	              $smallmatches/source/@idref))
	       return <source idref="{$idref}"/>
	       }
	  </relation>
      }
      {
	(: Now the ones only present in the large lexicon :)
	for $largelexrelation in $largelexrelations
	where not(concat($largelexrelation/target/@idref,
                         $largelexrelation/@type) = $smallkeys)
	return $largelexrelation
      }
      {
	(: Finally the ones only in the small lexicon; the relation is
	   returned once per target whose key is unknown to the large
	   lexicon :)
	for $smalltarget in $smalltargets
	where not(concat($smalltarget/@idref,
                         $smalltarget/../@type) = $largekeys)
	return $smalltarget/..
      }
    </relations>
    )
(: Output of the merged lexicon graph.
   The lexitems are rebuilt with their lexid and type attributes and
   their text only, so the auxiliary orglexid attribute does not reach
   the result. The merged relations follow as computed above.
   NOTE(review): the knowledge elements are taken from the small
   lexicon only; confirm that the large lexicon has none to keep. :)
return
  <LG>
    <lexitems>{
      for $item in $alllexitems/lexitem
      return 
        <lexitem lexid="{$item/@lexid}" type="{$item/@type}">{
          $item/text()
        }</lexitem>
    }</lexitems>
    {$allrelations}
    {$smalllex/LG/knowledge}
  </LG>
(: Thorsten Trippel 2006-11-18 :)