Skip to content

Commit 506d7eb

Browse files
author
Javier
committed
added a tool to dump the dictionary with the estimation of the number of triples per term
1 parent ec01e9f commit 506d7eb

2 files changed

Lines changed: 192 additions & 1 deletion

File tree

libhdt/tests/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ logarr \
88
properties \
99
serd \
1010
streamtest \
11-
testmax
11+
testmax \
12+
dumpDictionary
1213
#cmp \
1314
#confm \
1415
#conops \

libhdt/tests/dumpDictionary.cpp

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#include <getopt.h>
2+
#include <cstdio>
3+
#include <fstream>
4+
#include <iostream>
5+
#include <sstream>
6+
#include <string>
7+
#include <sys/stat.h>
8+
9+
#include "../include/Dictionary.hpp"
10+
#include "../include/HDT.hpp"
11+
#include "../include/HDTEnums.hpp"
12+
#include "../include/HDTManager.hpp"
13+
#include "../include/Iterator.hpp"
14+
#include "../include/Triples.hpp"
15+
#include "../src/triples/TriplesList.hpp"
16+
17+
using namespace std;
18+
using namespace hdt;
19+
20+
/**
 * Print the command-line usage of dumpDictionary to standard output.
 *
 * The options listed here mirror the getopt() option string used by main()
 * ("houp:t:"). When -p / -t are not given, main() writes the corresponding
 * dump to standard output, so that is what the help text advertises.
 */
void help() {
	std::cout << "$ dumpDictionary <inputHDTFile>" << std::endl;
	std::cout
			<< "Process the input HDT file and generate <termURI>;EstimatedTriples;<rdf:label>"
			<< std::endl << std::endl;
	std::cout << "\t-o\t\t\tDump also objects" << std::endl << std::endl;
	std::cout << "\t-u\t\t\tDump only URIs" << std::endl << std::endl;
	std::cout << "\t-p <exportPredicateFile>\t\t\texportPredicateFile (stdout by default)" << std::endl << std::endl;
	// The original literal contained "\t\t\exportTermsFile": "\e" is a
	// non-standard escape that emits an ESC byte and drops the leading 'e'.
	std::cout << "\t-t <exportTermsFile>\t\t\texportTermsFile (stdout by default)" << std::endl << std::endl;
	std::cout << "\t-h\t\t\tThis help" << std::endl << std::endl;
}
31+
32+
/**
 * Tell whether a string terminates with a given suffix.
 *
 * @param fullString  text to inspect
 * @param ending      suffix to look for
 * @return true when the last ending.length() characters of fullString are
 *         exactly `ending` (an empty `ending` always matches); false when
 *         fullString is shorter than `ending` or the tail differs.
 */
bool hasEnding (std::string const &fullString, std::string const &ending) {
	const std::string::size_type suffixLen = ending.length();
	const std::string::size_type totalLen = fullString.length();

	// A suffix longer than the text can never match.
	if (totalLen < suffixLen) {
		return false;
	}

	const std::string::size_type tailStart = totalLen - suffixLen;
	return fullString.compare(tailStart, suffixLen, ending) == 0;
}
39+
40+
int main(int argc, char *argv[]) {
41+
int c;
42+
string inputFile;
43+
string outputFile;
44+
bool dumpObjects=false;
45+
bool onlyURIs=false;
46+
string outputPredFileString;
47+
string outputTermFileString;
48+
string label = "http://www.w3.org/2000/01/rdf-schema#label";
49+
50+
while ((c = getopt(argc, argv, "houp:t:")) != -1) {
51+
switch (c) {
52+
case 'h':
53+
help();
54+
break;
55+
case 'o':
56+
dumpObjects=true;
57+
break;
58+
case 'u':
59+
onlyURIs=true;
60+
break;
61+
case 'p':
62+
outputPredFileString = optarg;
63+
break;
64+
case 't':
65+
outputTermFileString = optarg;
66+
break;
67+
default:
68+
cout << "ERROR: Unknown option" << endl;
69+
help();
70+
return 1;
71+
}
72+
}
73+
74+
if (argc < 2) {
75+
cout << "ERROR: You must supply an input HDT File" << endl << endl;
76+
help();
77+
return 1;
78+
}
79+
inputFile = argv[optind];
80+
81+
// Load HDT file
82+
HDT *hdt = HDTManager::mapIndexedHDT(inputFile.c_str());
83+
84+
ostream *outP,*outT;
85+
ofstream outPF,outTF;
86+
87+
if(outputPredFileString!="") {
88+
outPF.open(outputPredFileString.c_str());
89+
outP = &outPF;
90+
} else {
91+
outP = &cout;
92+
}
93+
94+
if(outputTermFileString!="") {
95+
outTF.open(outputTermFileString.c_str());
96+
outT = &outTF;
97+
} else {
98+
outT = &cout;
99+
}
100+
101+
102+
/*
103+
* Compute over the dictionary
104+
*/
105+
106+
for (size_t i=0;i<hdt->getDictionary()->getNpredicates();i++){
107+
string pred = hdt->getDictionary()->idToString(i+1, PREDICATE);
108+
109+
IteratorTripleString *it1 = hdt->search("",pred.c_str(),"");
110+
int numResults = it1->estimatedNumResults();
111+
112+
// check label
113+
IteratorTripleString *itlabel = hdt->search(pred.c_str(),label.c_str(),"");
114+
string rdfsLabel="";
115+
while (itlabel->hasNext()){
116+
string label = itlabel->next()->getObject();
117+
if (hasEnding(label,"@en")||hasEnding(label,"\"")){
118+
119+
std::size_t found = label.find_last_of("\"");
120+
label = label.substr(1,found-1);
121+
if (rdfsLabel.length()>0){
122+
rdfsLabel=rdfsLabel+" "+label;
123+
}
124+
else
125+
rdfsLabel=label;
126+
}
127+
}
128+
delete itlabel;
129+
delete it1;
130+
131+
*outP<<pred<<";"<<numResults<<";"<<rdfsLabel<<endl;
132+
133+
134+
135+
}
136+
137+
for (size_t i=1;i<=hdt->getDictionary()->getNsubjects();i++){
138+
139+
string subj = hdt->getDictionary()->idToString(i, SUBJECT);
140+
141+
IteratorTripleString *it1 = hdt->search(subj.c_str(),"","");
142+
int numResults = it1->estimatedNumResults();
143+
if (i<hdt->getDictionary()->getNshared()){
144+
IteratorTripleString* it2 = hdt->search("","",subj.c_str());
145+
numResults+=it2->estimatedNumResults();
146+
}
147+
148+
// check label
149+
IteratorTripleString *itlabel = hdt->search(subj.c_str(),label.c_str(),"");
150+
string rdfsLabel="";
151+
while (itlabel->hasNext()){
152+
string label = itlabel->next()->getObject();
153+
if (hasEnding(label,"@en")||hasEnding(label,"\"")){
154+
155+
std::size_t found = label.find_last_of("\"");
156+
label = label.substr(1,found-1);
157+
if (rdfsLabel.length()>0){
158+
rdfsLabel=rdfsLabel+" "+label;
159+
}
160+
else
161+
rdfsLabel=label;
162+
}
163+
}
164+
delete itlabel;
165+
delete it1;
166+
167+
168+
169+
*outT<<subj<<";"<<numResults<<";"<<rdfsLabel<<endl;
170+
171+
}
172+
if (dumpObjects){
173+
for (int i=hdt->getDictionary()->getNshared();i<hdt->getDictionary()->getNobjects();i++){
174+
string obj = hdt->getDictionary()->idToString(i+1, OBJECT);
175+
176+
IteratorTripleString *it1 = hdt->search("","",obj.c_str());
177+
int numResults = it1->estimatedNumResults();
178+
if (!onlyURIs||obj.at(0)!='"'){
179+
*outT<<obj<<";"<<numResults<<endl;
180+
}
181+
delete it1;
182+
183+
}
184+
}
185+
outTF.close();
186+
outPF.close();
187+
188+
189+
delete hdt;
190+
}

0 commit comments

Comments
 (0)