To make a mirror, follow these 5 simple steps.
First, you must install the Virtuoso database
and edit virtuoso.ini so that the server is allowed to read files from the directory that holds the data files:
DirsAllowed = ., /path/files/script
1º Download the files from DBpedia with the following command:
# Download every *.nt.bz2 dump of the English DBpedia 3.8 release.
# -r/-np/-nd: recurse below the URL only, without recreating the remote tree;
# -nc: skip files that already exist locally; -A: keep only *.nt.bz2 files.
# -P ./download saves the files where the decompress step looks for them.
wget -r -np -nd -nc -A'*.nt.bz2' -P ./download http://downloads.dbpedia.org/3.8/en/
2º Decompress the files (each .bz2 file is recompressed as a .gz file):
# Recompress every downloaded .bz2 dump as .gz and move it to ./decompress/.
# The source .bz2 is removed only after its .gz has been written and moved.
mkdir -p ./decompress
for i in ./download/*.bz2 ; do
  # An unmatched glob stays literal; skip it instead of feeding it to bzcat.
  [ -e "$i" ] || continue
  bzcat "$i" | gzip --fast > "${i%.bz2}.gz" \
    && mv "${i%.bz2}.gz" ./decompress/ \
    && rm "$i"
done &
# The loop above runs in the background: run `wait` before the next step so
# that every file has been recompressed.
3º Delete the triples whose object is longer than 1024 characters.
# Filter every .gz dump in decompress/, dropping the lines matched by the grep
# pattern below, and store the result as ../fixed/<name>.gz.clean.
cd decompress || exit 1
mkdir -p ../fixed
for i in *.gz ; do
  # An unmatched glob stays literal; skip it instead of calling zcat on it.
  [ -e "$i" ] || continue
  echo -n "cleaning $i..."
  # -v drops lines of the form "<...> <...> <X> ." where X has 1025+ characters.
  zcat "$i" | grep -v -E '^<.+> <.+> <.{1025,}> \.$' > "$i.clean"
  # Move only the file just produced rather than re-globbing *clean.
  mv "$i.clean" ../fixed/
  echo "done."
done
4º Split the large files into smaller files.
# Split every cleaned file larger than 5000000 bytes into 50000-line chunks
# under split/; smaller files are moved into split/ unchanged.
# The cleaning step leaves the shell inside decompress/, where fixed/ is a
# sibling directory, so fall back to ../fixed when ./fixed does not exist.
cd fixed 2>/dev/null || cd ../fixed || exit 1
mkdir -p split
for i in *clean ; do
  # An unmatched glob stays literal; skip it instead of calling stat on it.
  [ -e "$i" ] || continue
  split_size=$(stat -c%s "$i")
  echo "Size = $split_size"
  echo -n "slipting $i..."
  if [ "$split_size" -gt 5000000 ]; then
    echo "File is large."
    echo "Performing split on file $i"
    # -a 10: 10-character suffixes; chunks are named split/<file><suffix>.
    split -a 10 -l 50000 "$i" "split/$i"
  else
    mv "$i" split/
  fi
done
5º Load each file into the Virtuoso database.
# Load every file of split/ into Virtuoso with TTLP_MT (target graph argument:
# http://dbpedia.org) and move each successfully loaded file to process-files/.
# Stops at the first file that fails to load.
echo "creating load statement"
cd split || exit 1
mkdir -p process-files
for file in * ; do
  # Skip process-files/ itself and the literal "*" of an unmatched glob.
  [ -f "$file" ] || continue
  # Absolute path of the file being iterated; the server reads it itself, so
  # this directory must be listed in DirsAllowed in virtuoso.ini.
  load_target="$PWD/$file"
  load_query="EXEC=TTLP_MT(file_to_string_output('$load_target'), '', 'http://dbpedia.org', 255);commit WORK;"
  # echo "$virt_isql" "$virt_port" "$virt_userName" "$virt_passWord" "$load_query"
  # Port 1111 and dba/dba are hard-coded here; adjust them to your server.
  # Any non-zero exit status is treated as a failure.
  if isql-vt 1111 dba dba "$load_query"; then
    #echo "execution ok. move data"
    # Progress indicator: number of entries still in this directory.
    ls | wc -l
    mv "$load_target" process-files/
    echo "procesando $file"
  else
    echo "error to process file $file"
    # Exit with a failure status so callers can detect the aborted load.
    exit 1
  fi
done
echo "done"