Hello Freddy,
Thank you for your additional answer, which sounds like the optimal solution I'm looking for. Unfortunately, at the moment I have only very limited knowledge of XSLT transformations. I played around with some very simple examples using .xsl and .dtd, which worked fine.
For my challenge there are at least two problems that I have no idea how to solve:
1. Detecting the words to extract using a range of their positions
2. Adding a single space between the extracted words, but not after the last one (see also the comment in the resulting xml)
The following code shows a simplified example of an original xml file:
<?xml version="1.0" encoding="UTF-8" ?>
<extractcontent xmlns="http://www.callassoftware.com/ns/pdftoolbox/1.0"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://www.callassoftware.com/ns/pdftoolbox/1.0 extractcontent.xsd">
<documents>
<document>
<pages>
<page id="0">
<words>
<word txt="This">
<parts>
<part tlh="2104.85" tlv="2336.36" trh="2104.85" trv="2342" blh="2117.31" blv="2336.36" brh="2117.31" brv="2342"></part>
<part tlh="2104.85" tlv="2342" trh="2104.85" trv="2347.84" blh="2117.31" blv="2342" brh="2117.31" brv="2347.84"></part>
<part tlh="2104.85" tlv="2347.84" trh="2104.85" trv="2353.48" blh="2117.31" blv="2347.84" brh="2117.31" brv="2353.48"></part>
<part tlh="2104.85" tlv="2353.48" trh="2104.85" trv="2362.43" blh="2117.31" blv="2353.48" brh="2117.31" brv="2362.43"></part>
<part tlh="2104.85" tlv="2362.43" trh="2104.85" trv="2368.66" blh="2117.31" blv="2362.43" brh="2117.31" brv="2368.66"></part>
<part tlh="2104.85" tlv="2368.66" trh="2104.85" trv="2374.5" blh="2117.31" blv="2368.66" brh="2117.31" brv="2374.5"></part>
</parts>
</word>
<word txt="is">
<parts>
<part tlh="2104.85" tlv="2377.42" trh="2104.85" trv="2382.67" blh="2117.31" blv="2377.42" brh="2117.31" brv="2382.67"></part>
<part tlh="2104.85" tlv="2382.67" trh="2104.85" trv="2385" blh="2117.31" blv="2382.67" brh="2117.31" brv="2385"></part>
<part tlh="2104.85" tlv="2385" trh="2104.85" trv="2390.83" blh="2117.31" blv="2385" brh="2117.31" brv="2390.83"></part>
</parts>
</word>
<word txt="not">
<parts>
<part tlh="2104.85" tlv="2393.75" trh="2104.85" trv="2399.39" blh="2117.31" blv="2393.75" brh="2117.31" brv="2399.39"></part>
<part tlh="2104.85" tlv="2399.39" trh="2104.85" trv="2405.03" blh="2117.31" blv="2399.39" brh="2117.31" brv="2405.03"></part>
<part tlh="2104.85" tlv="2405.03" trh="2104.85" trv="2411.06" blh="2117.31" blv="2405.03" brh="2117.31" brv="2411.06"></part>
<part tlh="2104.85" tlv="2411.06" trh="2104.85" trv="2416.9" blh="2117.31" blv="2411.06" brh="2117.31" brv="2416.9"></part>
<part tlh="2104.85" tlv="2416.9" trh="2104.85" trv="2423.12" blh="2117.31" blv="2416.9" brh="2117.31" brv="2423.12"></part>
<part tlh="2104.85" tlv="2423.12" trh="2104.85" trv="2425.45" blh="2117.31" blv="2423.12" brh="2117.31" brv="2425.45"></part>
<part tlh="2104.85" tlv="2425.45" trh="2104.85" trv="2431.09" blh="2117.31" blv="2425.45" brh="2117.31" brv="2431.09"></part>
<part tlh="2104.85" tlv="2431.09" trh="2104.85" trv="2433.42" blh="2117.31" blv="2431.09" brh="2117.31" brv="2433.42"></part>
<part tlh="2104.85" tlv="2433.42" trh="2104.85" trv="2439.45" blh="2117.31" blv="2433.42" brh="2117.31" brv="2439.45"></part>
<part tlh="2104.85" tlv="2439.45" trh="2104.85" trv="2445.29" blh="2117.31" blv="2439.45" brh="2117.31" brv="2445.29"></part>
<part tlh="2104.85" tlv="2445.29" trh="2104.85" trv="2450.93" blh="2117.31" blv="2445.29" brh="2117.31" brv="2450.93"></part>
<part tlh="2104.85" tlv="2450.93" trh="2104.85" trv="2457.15" blh="2117.31" blv="2450.93" brh="2117.31" brv="2457.15"></part>
<part tlh="2104.85" tlv="2457.15" trh="2104.85" trv="2463.18" blh="2117.31" blv="2457.15" brh="2117.31" brv="2463.18"></part>
<part tlh="2104.85" tlv="2463.18" trh="2104.85" trv="2466.68" blh="2117.31" blv="2463.18" brh="2117.31" brv="2466.68"></part>
<part tlh="2104.85" tlv="2466.68" trh="2104.85" trv="2469.59" blh="2117.31" blv="2466.68" brh="2117.31" brv="2469.59"></part>
</parts>
</word>
<word txt="interesting">
<parts>
<part tlh="2104.85" tlv="2472.51" trh="2104.85" trv="2478.15" blh="2117.31" blv="2472.51" brh="2117.31" brv="2478.15"></part>
<part tlh="2104.85" tlv="2478.15" trh="2104.85" trv="2483.4" blh="2117.31" blv="2478.15" brh="2117.31" brv="2483.4"></part>
<part tlh="2104.85" tlv="2483.4" trh="2104.85" trv="2489.24" blh="2117.31" blv="2483.4" brh="2117.31" brv="2489.24"></part>
<part tlh="2104.85" tlv="2489.24" trh="2104.85" trv="2498.2" blh="2117.31" blv="2489.24" brh="2117.31" brv="2498.2"></part>
<part tlh="2104.85" tlv="2498.2" trh="2104.85" trv="2500.53" blh="2117.31" blv="2498.2" brh="2117.31" brv="2500.53"></part>
<part tlh="2104.85" tlv="2500.53" trh="2104.85" trv="2506.17" blh="2117.31" blv="2500.53" brh="2117.31" brv="2506.17"></part>
<part tlh="2104.85" tlv="2506.17" trh="2104.85" trv="2512" blh="2117.31" blv="2506.17" brh="2117.31" brv="2512"></part>
<part tlh="2104.85" tlv="2512" trh="2104.85" trv="2518.23" blh="2117.31" blv="2512" brh="2117.31" brv="2518.23"></part>
<part tlh="2104.85" tlv="2518.23" trh="2104.85" trv="2524.26" blh="2117.31" blv="2518.23" brh="2117.31" brv="2524.26"></part>
</parts>
</word>
<word txt="for">
<parts>
<part tlh="2104.85" tlv="2527.18" trh="2104.85" trv="2529.51" blh="2117.31" blv="2527.18" brh="2117.31" brv="2529.51"></part>
<part tlh="2104.85" tlv="2529.51" trh="2104.85" trv="2535.15" blh="2117.31" blv="2529.51" brh="2117.31" brv="2535.15"></part>
<part tlh="2104.85" tlv="2535.15" trh="2104.85" trv="2540.4" blh="2117.31" blv="2535.15" brh="2117.31" brv="2540.4"></part>
<part tlh="2104.85" tlv="2540.4" trh="2104.85" trv="2546.03" blh="2117.31" blv="2540.4" brh="2117.31" brv="2546.03"></part>
<part tlh="2104.85" tlv="2546.03" trh="2104.85" trv="2552.26" blh="2117.31" blv="2546.03" brh="2117.31" brv="2552.26"></part>
<part tlh="2104.85" tlv="2552.26" trh="2104.85" trv="2558.29" blh="2117.31" blv="2552.26" brh="2117.31" brv="2558.29"></part>
</parts>
</word>
<word txt="me">
<parts>
<part tlh="2104.85" tlv="2561.21" trh="2104.85" trv="2567.43" blh="2117.31" blv="2561.21" brh="2117.31" brv="2567.43"></part>
<part tlh="2104.85" tlv="2567.43" trh="2104.85" trv="2569.76" blh="2117.31" blv="2567.43" brh="2117.31" brv="2569.76"></part>
<part tlh="2104.85" tlv="2569.76" trh="2104.85" trv="2575.4" blh="2117.31" blv="2569.76" brh="2117.31" brv="2575.4"></part>
<part tlh="2104.85" tlv="2575.4" trh="2104.85" trv="2578.9" blh="2117.31" blv="2575.4" brh="2117.31" brv="2578.9"></part>
<part tlh="2104.85" tlv="2578.9" trh="2104.85" trv="2581.23" blh="2117.31" blv="2578.9" brh="2117.31" brv="2581.23"></part>
<part tlh="2104.85" tlv="2581.23" trh="2104.85" trv="2587.26" blh="2117.31" blv="2581.23" brh="2117.31" brv="2587.26"></part>
</parts>
</word>
<word txt="This">
<parts>
<part tlh="149.891" tlv="4215.23" trh="149.891" trv="4209.45" blh="140.395" blv="4215.23" brh="140.395" brv="4209.45"></part>
<part tlh="149.891" tlv="4209.45" trh="149.891" trv="4205.15" blh="140.395" blv="4209.45" brh="140.395" brv="4205.15"></part>
<part tlh="149.891" tlv="4205.15" trh="149.891" trv="4200.71" blh="140.395" blv="4205.15" brh="140.395" brv="4200.71"></part>
<part tlh="149.891" tlv="4200.71" trh="149.891" trv="4196.71" blh="140.395" blv="4200.71" brh="140.395" brv="4196.71"></part>
<part tlh="149.891" tlv="4196.71" trh="149.891" trv="4192.11" blh="140.395" blv="4196.71" brh="140.395" brv="4192.11"></part>
<part tlh="149.891" tlv="4192.11" trh="149.891" trv="4187.67" blh="140.395" blv="4192.11" brh="140.395" brv="4187.67"></part>
</parts>
</word>
<word txt="is">
<parts>
<part tlh="149.891" tlv="4185.44" trh="149.891" trv="4179.81" blh="140.395" blv="4185.44" brh="140.395" brv="4179.81"></part>
<part tlh="149.891" tlv="4179.81" trh="149.891" trv="4178.03" blh="140.395" blv="4179.81" brh="140.395" brv="4178.03"></part>
<part tlh="149.891" tlv="4178.03" trh="149.891" trv="4174.03" blh="140.395" blv="4178.03" brh="140.395" brv="4174.03"></part>
<part tlh="149.891" tlv="4174.03" trh="149.891" trv="4169.29" blh="140.395" blv="4174.03" brh="140.395" brv="4169.29"></part>
<part tlh="149.891" tlv="4169.29" trh="149.891" trv="4167.51" blh="140.395" blv="4169.29" brh="140.395" brv="4167.51"></part>
<part tlh="149.891" tlv="4167.51" trh="149.891" trv="4163.22" blh="140.395" blv="4167.51" brh="140.395" brv="4163.22"></part>
<part tlh="149.891" tlv="4163.22" trh="149.891" trv="4159.22" blh="140.395" blv="4163.22" brh="140.395" brv="4159.22"></part>
</parts>
</word>
<word txt="what">
<parts>
<part tlh="149.891" tlv="4156.99" trh="149.891" trv="4152.55" blh="140.395" blv="4156.99" brh="140.395" brv="4152.55"></part>
<part tlh="149.891" tlv="4152.55" trh="149.891" trv="4150.32" blh="140.395" blv="4152.55" brh="140.395" brv="4150.32"></part>
<part tlh="149.891" tlv="4150.32" trh="149.891" trv="4145.87" blh="140.395" blv="4150.32" brh="140.395" brv="4145.87"></part>
</parts>
</word>
<word txt="I">
<parts>
<part tlh="149.891" tlv="4143.65" trh="149.891" trv="4137.87" blh="140.395" blv="4143.65" brh="140.395" brv="4137.87"></part>
<part tlh="149.891" tlv="4137.87" trh="149.891" trv="4133.58" blh="140.395" blv="4137.87" brh="140.395" brv="4133.58"></part>
<part tlh="149.891" tlv="4133.58" trh="149.891" trv="4129.28" blh="140.395" blv="4133.58" brh="140.395" brv="4129.28"></part>
<part tlh="149.891" tlv="4129.28" trh="149.891" trv="4124.54" blh="140.395" blv="4129.28" brh="140.395" brv="4124.54"></part>
<part tlh="149.891" tlv="4124.54" trh="149.891" trv="4120.24" blh="140.395" blv="4124.54" brh="140.395" brv="4120.24"></part>
<part tlh="149.891" tlv="4120.24" trh="149.891" trv="4117.58" blh="140.395" blv="4120.24" brh="140.395" brv="4117.58"></part>
</parts>
</word>
<word txt="need">
<parts>
<part tlh="149.891" tlv="4115.35" trh="149.891" trv="4109.58" blh="140.395" blv="4115.35" brh="140.395" brv="4109.58"></part>
<part tlh="149.891" tlv="4109.58" trh="149.891" trv="4105.28" blh="140.395" blv="4109.58" brh="140.395" brv="4105.28"></part>
<part tlh="149.891" tlv="4105.28" trh="149.891" trv="4102.62" blh="140.395" blv="4105.28" brh="140.395" brv="4102.62"></part>
<part tlh="149.891" tlv="4102.62" trh="149.891" trv="4097.87" blh="140.395" blv="4102.62" brh="140.395" brv="4097.87"></part>
</parts>
</word>
<word txt="AndIneedthis111">
<parts>
<part tlh="166.261" tlv="4278.19" trh="166.261" trv="4273.74" blh="156.765" blv="4278.19" brh="156.765" brv="4273.74"></part>
<part tlh="166.261" tlv="4273.74" trh="166.261" trv="4269.29" blh="156.765" blv="4273.74" brh="156.765" brv="4269.29"></part>
<part tlh="166.261" tlv="4269.29" trh="166.261" trv="4264.84" blh="156.765" blv="4269.29" brh="156.765" brv="4264.84"></part>
<part tlh="166.261" tlv="4264.84" trh="166.261" trv="4260.39" blh="156.765" blv="4264.84" brh="156.765" brv="4260.39"></part>
<part tlh="166.261" tlv="4260.39" trh="166.261" trv="4255.95" blh="156.765" blv="4260.39" brh="156.765" brv="4255.95"></part>
<part tlh="166.261" tlv="4255.95" trh="166.261" trv="4251.5" blh="156.765" blv="4255.95" brh="156.765" brv="4251.5"></part>
<part tlh="166.261" tlv="4251.5" trh="166.261" trv="4247.05" blh="156.765" blv="4251.5" brh="156.765" brv="4247.05"></part>
<part tlh="166.261" tlv="4247.05" trh="166.261" trv="4242.6" blh="156.765" blv="4247.05" brh="156.765" brv="4242.6"></part>
</parts>
</word>
<word txt="IDontNeedThisWordWithAFixPositionBetweenOtherNeededWords">
<parts>
<part tlh="166.261" tlv="4240.38" trh="166.261" trv="4237.71" blh="156.765" blv="4240.38" brh="156.765" brv="4237.71"></part>
</parts>
</word>
<word txt="AndIneedthis222">
<parts>
<part tlh="166.261" tlv="4235.49" trh="166.261" trv="4230.31" blh="156.765" blv="4235.49" brh="156.765" brv="4230.31"></part>
<part tlh="166.261" tlv="4230.31" trh="166.261" trv="4225.12" blh="156.765" blv="4230.31" brh="156.765" brv="4225.12"></part>
<part tlh="166.261" tlv="4225.12" trh="166.261" trv="4219.94" blh="156.765" blv="4225.12" brh="156.765" brv="4219.94"></part>
<part tlh="166.261" tlv="4219.94" trh="166.261" trv="4214.75" blh="156.765" blv="4219.94" brh="156.765" brv="4214.75"></part>
<part tlh="166.261" tlv="4214.75" trh="166.261" trv="4209.27" blh="156.765" blv="4214.75" brh="156.765" brv="4209.27"></part>
<part tlh="166.261" tlv="4209.27" trh="166.261" trv="4203.79" blh="156.765" blv="4209.27" brh="156.765" brv="4203.79"></part>
<part tlh="166.261" tlv="4203.79" trh="166.261" trv="4198.31" blh="156.765" blv="4203.79" brh="156.765" brv="4198.31"></part>
<part tlh="166.261" tlv="4198.31" trh="166.261" trv="4192.54" blh="156.765" blv="4198.31" brh="156.765" brv="4192.54"></part>
<part tlh="166.261" tlv="4192.54" trh="166.261" trv="4186.76" blh="156.765" blv="4192.54" brh="156.765" brv="4186.76"></part>
<part tlh="166.261" tlv="4186.76" trh="166.261" trv="4180.99" blh="156.765" blv="4186.76" brh="156.765" brv="4180.99"></part>
</parts>
</word>
</words>
</page>
</pages>
</document>
</documents>
</extractcontent>
The resulting xml file should be reduced like this:
<?xml version="1.0" encoding="UTF-8" ?>
<extractcontent xmlns="http://www.callassoftware.com/ns/pdftoolbox/1.0"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://www.callassoftware.com/ns/pdftoolbox/1.0 extractcontent.xsd">
<TextExtracted1>
This is what I need<!-- A single space should be added between the words, but not after the last one -->
</TextExtracted1>
<TextExtracted2>
AndIneedthis111
</TextExtracted2>
<TextExtracted3>
AndIneedthis222
</TextExtracted3>
</extractcontent>
Could you please give me some hints on how the .dtd and .xsl have to look?
Kind regards,
Thomas