Monday, September 2, 2013

Make a DBpedia mirror in 5 steps

To make a mirror, follow these 5 simple steps. Before you start, you must install the Virtuoso database and change virtuoso.ini so that the server is allowed to read the data files:
DirsAllowed              = ., /path/files/script 
1º Download the files from DBpedia with this command:
# Fetch every *.nt.bz2 dump from the DBpedia 3.8/en download directory.
# -r/-np recurse without climbing to the parent, -nd flattens the remote
# tree, -nc skips files that already exist locally, and -P stores the
# files in ./download, which is where the decompress step reads them from.
wget -r -np -nd -nc -P ./download -A'*.nt.bz2' http://downloads.dbpedia.org/3.8/en/
2º Decompress the .bz2 files (they are re-compressed as .gz on the fly):
# Re-compress every downloaded ./download/*.bz2 archive as a fast-gzip file
# and move the result to ./decompress/. The whole job runs in a background
# subshell; run `wait` before starting the next step so it sees all files.
(
  # bash: make the pipeline fail when bzcat fails, not only when gzip fails,
  # so a corrupt archive is never deleted after a bogus "success".
  set -o pipefail

  for i in ./download/*.bz2; do
    # An unmatched glob stays literal; skip it instead of processing it.
    [ -e "$i" ] || continue

    gz="${i%.bz2}.gz"
    if bzcat "$i" | gzip --fast > "$gz"; then
      # Remove the source archive only once the new file is safely moved.
      mv -- "$gz" ./decompress/ && rm -- "$i"
    else
      echo "failed to recompress $i" >&2
      rm -f -- "$gz"
    fi
  done
) &
3º Delete the triples whose object is longer than 1024 characters:
# For every decompress/*.gz file, write an uncompressed copy named
# <file>.clean into ../fixed/, dropping each line whose third <...> term
# (the object) is 1025 or more characters long.
# Runs in a subshell so the caller's working directory is left unchanged
# and the following step can still reach the sibling "fixed" directory.
(
  cd decompress || exit 1

  for i in *.gz; do
    # An unmatched glob stays literal; skip it instead of processing it.
    [ -e "$i" ] || continue

    printf '%s' "cleaning $i..."
    zcat "$i" | grep -v -E '^<.+> <.+> <.{1025,}> \.$' > "$i.clean"
    # Move only the file just produced rather than everything matching *clean.
    mv -- "$i.clean" ../fixed/
    echo "done."
  done
)
4º Split the large files into smaller files:
# Distribute the cleaned files into fixed/split/: files larger than
# 5000000 bytes are cut into 50000-line pieces (10-character suffixes,
# prefixed with the original file name); smaller files are moved as they are.
# The working directory is left at "fixed" when this step finishes.
cd fixed || exit 1
mkdir -p split

for i in *clean; do
  # Skip the literal pattern left behind when nothing matches.
  [ -f "$i" ] || continue

  split_size=$(stat -c%s "$i")

  echo "Size = $split_size"

  echo -n "slipting  $i..."

  if [ "$split_size" -gt 5000000 ]; then
    echo "File is large."

    echo "Performing split on file $i"

    # The source file itself stays in place; only the pieces go to split/.
    split -a 10 -l 50000 -- "$i" "split/$i"
  else
    mv -- "$i" split/
  fi

done
5º Run the insert command in the Virtuoso database:
# Load every file of the split directory into Virtuoso with TTLP_MT, one
# isql-vt call per file, and move each loaded file to process-files/.
# Stops at the first file that fails to load.
echo "creating load statement"

# Absolute path under which the server reads the files.
# NOTE(review): presumably this is the same directory as ./split and must be
# covered by DirsAllowed in virtuoso.ini; adjust it to your layout.
load_dir="/home/user/split"

cd split || exit 1
mkdir -p process-files

for file in *; do
  # Only regular files are loaded; this also skips process-files/ itself.
  [ -f "$file" ] || continue

  load_target="$load_dir/$file"
  load_query="EXEC=TTLP_MT(file_to_string_output('$load_target'), '', 'http://dbpedia.org', 255);commit WORK;"
  # Debug aid: printf '%s\n' "$load_query"

  # Only exit status 0 counts as success (e.g. 127, command not found, does not).
  if isql-vt 1111 dba dba "$load_query"; then
    # Progress indicator: number of entries still in the load directory.
    ls "$load_dir" | wc -l
    mv -- "$load_target" process-files/
    echo "procesando $file"
  else
    echo "error to process file $file"
    exit 1
  fi
done
echo "done"