-
Notifications
You must be signed in to change notification settings - Fork 67
Expand file tree
/
Copy pathrdf2hdtcat.sh
More file actions
106 lines (96 loc) · 3.16 KB
/
rdf2hdtcat.sh
File metadata and controls
106 lines (96 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
#######################################
# Prints the usage text of the script.
# The long option names listed here must match the ones the option parser
# accepts (the hdtCat location is selected with -c / --cat).
# Arguments: none
# Outputs:   the help text on stdout
#######################################
showhelp() {
  printf '%s\n' \
    "" \
    "Script to serialize a big RDF file in n-triples format into HDT" \
    "It splits the file in N parts, compress each one with rdf2hdt, and merges them iteratively with hdtCat." \
    "" \
    "Usage $0 [OPTION]" \
    "" \
    " -c, --cat location of hdtCat script (assuming it's in PATH by default)" \
    " -i, --input input file (input.rdf by default)" \
    " -h, --help shows this help and exits" \
    " -n, --number number of files to split FILE (2 by default)" \
    " -o, --output output file (output.hdt by default)" \
    " -p, --parallel number of threads to serialize RDF into HDT in parallel (1 by default)" \
    " -r, --rdf2hdt location of rdf2hdt script (assuming it's in PATH by default)" \
    ""
}
# Default settings, grouped by kind.
# Integer attribute: values assigned to these are evaluated arithmetically.
declare -i splits=2 threads=1 lines
# Input and output files.
declare input="input.rdf" output="output.hdt"
# Helper scripts, looked up through PATH unless a location is given.
declare rdf2hdt="rdf2hdt.sh" hdtCat="hdtCat.sh"
# Normalise the command line with "enhanced" getopt when it is available.
# The probe `getopt --test` is expected to exit with status 4 in that case.
getopt --test > /dev/null
case $? in
  4)
    # Short and long option specs; a trailing ':' marks an option taking a value.
    OPTIONS=c:i:hn:o:p:r:
    LONGOPTIONS=cat:,input:,help,number:,output:,parallel:,rdf2hdt:
    if ! COMMAND=$(getopt --options "$OPTIONS" --longoptions "$LONGOPTIONS" --name "$0" -- "$@"); then
      # Invalid command line: stop with a usage-error status.
      exit 2
    fi
    # Replace the positional parameters with the list getopt produced.
    eval set -- "$COMMAND"
    ;;
  *)
    echo "Enhanced getopt not supported. Brace yourself, this is not tested, but it should work :-)"
    ;;
esac
#######################################
# Aborts with a usage error unless the option in $1 is followed by a value.
# Without this check `shift 2` fails when only one argument is left, nothing
# is shifted and the parsing loop never ends.
# Arguments: the remaining argument list, starting at the option itself
#######################################
_require_value() {
  if (( $# < 2 )); then
    echo "$0: option '$1' requires an argument" >&2
    exit 2
  fi
}

#######################################
# Consumes the command-line options and stores their values in the
# script-level settings (hdtCat, input, splits, output, threads, rdf2hdt).
# Parsing stops at "--" or when the list is exhausted, so a raw argument
# list without a trailing "--" is accepted as well.
# Arguments: the option list to parse
# Exits:     0 after printing the help for -h/--help,
#            2 on a missing option value or an unrecognized argument
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -c|--cat)
        _require_value "$@"
        hdtCat=$2
        shift 2
        ;;
      -i|--input)
        _require_value "$@"
        input=$2
        shift 2
        ;;
      -n|--number)
        _require_value "$@"
        splits=$2
        shift 2
        ;;
      -o|--output)
        _require_value "$@"
        output=$2
        shift 2
        ;;
      -p|--parallel)
        _require_value "$@"
        threads=$2
        shift 2
        ;;
      -r|--rdf2hdt)
        _require_value "$@"
        rdf2hdt=$2
        shift 2
        ;;
      -h|--help)
        showhelp
        exit 0
        ;;
      --)
        # End of options; anything after it is ignored.
        shift
        break
        ;;
      *)
        echo "$0: unrecognized argument '$1'" >&2
        showhelp
        exit 2
        ;;
    esac
  done
}

# The script's own positional parameters are left untouched by the call.
parse_args "$@"
# ---------------------------------------------------------------------------
# Pipeline: split the input into parts, serialize every part into HDT, merge
# the partial HDT files pairwise until a single one is left, move it to the
# requested output and remove the intermediate files.
#
# Intermediate files live next to the input and are named
#   <input>_split_<xx>          a part produced by split
#   <input>_split_<xx>_<n>.hdt  an HDT file of the round that has <n> files
# ---------------------------------------------------------------------------

# Prints the given message framed by two separator lines on stdout.
banner() {
  echo "***************************************************************"
  echo "$@"
  echo "***************************************************************"
}

# Prints an error message on stderr and stops the script with status 1.
die() {
  echo "$0: $*" >&2
  exit 1
}

# Removes the intermediate files and then stops the script like die does.
abort_pipeline() {
  rm -f -- "${input}_split_"*
  die "$@"
}

[[ -f "$input" && -r "$input" ]] || die "cannot read input file '$input'"
(( splits >= 1 )) || die "the number of splits must be a positive integer"

total_lines=$(wc -l < "$input") || die "cannot count the lines of '$input'"
# wc -l counts newline characters, so a last line without a trailing newline
# has to be added by hand.
if [[ -n "$(tail -c 1 < "$input")" ]]; then
  total_lines=$(( total_lines + 1 ))
fi
(( total_lines > 0 )) || die "input file '$input' is empty"

# Number of lines per part, rounding up.
lines=$(( (total_lines + splits - 1) / splits ))

# Leftovers of an interrupted run would otherwise be picked up as parts.
rm -f -- "${input}_split_"*
split -l "$lines" -- "$input" "${input}_split_" \
  || abort_pipeline "could not split '$input'"

# Work with the parts that really exist: split may create fewer than $splits.
parts=("${input}_split_"*)
count=${#parts[@]}

banner "Serializing into HDT $count files using $threads threads"
# $rdf2hdt is expanded unquoted on purpose, so that a command followed by
# arguments keeps working as it did before.
# shellcheck disable=SC2086
printf '%s\0' "${parts[@]}" \
  | xargs -0 -I{} -P"$threads" $rdf2hdt -rdftype ntriples {} {}_"$count".hdt \
  || abort_pipeline "serialization into HDT failed"

# Script run by xargs for every pair of one round.
#   $1 number of files in the current round   $2 number of files in the next
#   $3 left part                               $4 right part (may be missing)
# A part without partner is renamed into the next round. The hdtCat command is
# spliced into the script text, as before, so it may carry arguments.
merge_pair='
from=$1 to=$2 left=$3 right=${4-}
if [ -z "$right" ]; then
  mv -- "${left}_${from}.hdt" "${left}_${to}.hdt"
else
  '"$hdtCat"' "${left}_${from}.hdt" "${right}_${from}.hdt" "${left}_${to}.hdt"
fi'

stems=("${parts[@]}")
while (( ${#stems[@]} > 1 )); do
  from=${#stems[@]}
  to=$(( (from + 1) / 2 ))

  level_files=()
  for stem in "${stems[@]}"; do
    level_files+=("${stem}_${from}.hdt")
  done
  banner "Merging $from hdt files: " "${level_files[@]}"

  printf '%s\0' "${stems[@]}" \
    | xargs -0 -n2 -P"$threads" bash -c "$merge_pair" merge "$from" "$to" \
    || abort_pipeline "merging HDT files failed"

  # Every result is named after the left part of its pair.
  survivors=()
  for (( k = 0; k < from; k += 2 )); do
    survivors+=("${stems[k]}")
  done
  stems=("${survivors[@]}")
done

banner "Moving output to '$output' file"
mv -- "${stems[0]}_1.hdt" "$output" \
  || abort_pipeline "could not move the merged HDT file to '$output'"

banner "Cleaning up split files"
rm -- "${input}_split_"*