Skip to content

Commit 8bbbff1

Browse files
committed
Rerun sentence correction preprocessing
1 parent e7dd9d4 commit 8bbbff1

2 files changed

Lines changed: 169 additions & 28 deletions

File tree

Lines changed: 134 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,134 @@
1-
{"dev": [42643, 38843, 30692, 30675, 30645, 19235, 7869, 7856, 7843, 7430, 6727, 6672, 5414, 4452, 4426, 3216, 2783], "test": [49123, 48763, 48676, 48357, 48257, 46910, 46858, 46839, 46144, 43778, 43726, 43649, 42051, 8658, 8636, 8628, 8612, 8597, 7560, 6829, 6104, 5640, 5267, 5047, 3155, 1892], "train": [219502, 219329, 217807, 216156, 192381, 188610, 188128, 188098, 188070, 188055, 187979, 187959, 173869, 161214, 161023, 159412, 159351, 158735, 158689, 156226, 154243, 150308, 150294, 150231, 138533, 123300, 123287, 122102, 121120, 102353, 93933, 93899, 93160, 91425, 80898, 77860, 76356, 76178, 74427, 73208, 71288, 70208, 70129, 70112, 70110, 69329, 69299, 69065, 69027, 68994, 58457, 55816, 35223, 35176, 35134, 32889, 32235, 32027, 30975, 30773, 12691, 12623, 12609, 12582, 12500, 12487, 10824, 9200, 8594, 8514, 8441, 7926, 7622, 7243, 5833, 5803, 5727]}
1+
{
2+
"dev": [
3+
42643,
4+
38843,
5+
30692,
6+
30675,
7+
30645,
8+
7869,
9+
7856,
10+
7843,
11+
7430,
12+
6727,
13+
6672,
14+
5414,
15+
4452,
16+
4426,
17+
3216,
18+
2783
19+
],
20+
"test": [
21+
49123,
22+
48763,
23+
48676,
24+
48357,
25+
48257,
26+
46910,
27+
46858,
28+
46839,
29+
46144,
30+
43778,
31+
43726,
32+
43649,
33+
42051,
34+
8658,
35+
8636,
36+
8628,
37+
8612,
38+
8597,
39+
7560,
40+
6829,
41+
6104,
42+
5640,
43+
5267,
44+
5047,
45+
3155,
46+
1892
47+
],
48+
"train": [
49+
219502,
50+
219329,
51+
217807,
52+
216156,
53+
192381,
54+
188610,
55+
188128,
56+
188098,
57+
188070,
58+
188055,
59+
187979,
60+
187959,
61+
179109,
62+
179107,
63+
179104,
64+
173869,
65+
161214,
66+
161023,
67+
159412,
68+
159351,
69+
158735,
70+
158689,
71+
156226,
72+
154243,
73+
150308,
74+
150294,
75+
150231,
76+
138533,
77+
123300,
78+
123287,
79+
122102,
80+
121120,
81+
102353,
82+
93933,
83+
93899,
84+
93160,
85+
91425,
86+
80898,
87+
77860,
88+
76356,
89+
76178,
90+
74427,
91+
73208,
92+
73188,
93+
71288,
94+
70208,
95+
70129,
96+
70110,
97+
70063,
98+
70043,
99+
70024,
100+
69329,
101+
69299,
102+
69065,
103+
69027,
104+
68994,
105+
58457,
106+
55816,
107+
35223,
108+
35176,
109+
35134,
110+
32889,
111+
32235,
112+
32027,
113+
30975,
114+
30773,
115+
12691,
116+
12623,
117+
12609,
118+
12582,
119+
12500,
120+
12487,
121+
10824,
122+
9489,
123+
9200,
124+
8594,
125+
8514,
126+
8441,
127+
7926,
128+
7622,
129+
7243,
130+
5833,
131+
5803,
132+
5727
133+
]
134+
}

scripts/sentence_correction_preprocessing.ipynb

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 2,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -12,12 +12,12 @@
1212
"import pandas as pd\n",
1313
"import numpy as np\n",
1414
"import text_extensions_for_pandas as tp\n",
15-
"from correct_label_errors import Dataset"
15+
"from download_and_correct_corpus import Dataset"
1616
]
1717
},
1818
{
1919
"cell_type": "code",
20-
"execution_count": 2,
20+
"execution_count": 3,
2121
"metadata": {},
2222
"outputs": [],
2323
"source": [
@@ -31,7 +31,7 @@
3131
},
3232
{
3333
"cell_type": "code",
34-
"execution_count": 3,
34+
"execution_count": 4,
3535
"metadata": {},
3636
"outputs": [],
3737
"source": [
@@ -50,7 +50,7 @@
5050
},
5151
{
5252
"cell_type": "code",
53-
"execution_count": 4,
53+
"execution_count": 5,
5454
"metadata": {},
5555
"outputs": [],
5656
"source": [
@@ -66,38 +66,37 @@
6666
},
6767
{
6868
"cell_type": "code",
69-
"execution_count": 5,
69+
"execution_count": 6,
7070
"metadata": {},
7171
"outputs": [
7272
{
73-
"name": "stderr",
73+
"name": "stdout",
7474
"output_type": "stream",
7575
"text": [
76-
"[WARNING] Invalid span (1130, 1140]: 'St. Louis'\n",
77-
"[WARNING] Could not find [1131,1140) 'St. Louis' \n",
78-
"[WARNING] Could not find [1131,1140) 'St. Louis' \n"
76+
"Nothing to append here! Check test, 20 again\n",
77+
"Nothing to append here! Check test, 30 again\n"
7978
]
8079
},
8180
{
82-
"name": "stdout",
81+
"name": "stderr",
8382
"output_type": "stream",
8483
"text": [
85-
"Nothing to append here! Check dev, 1 again\n",
86-
"The correct_span did not match lines, using corpus span instead at dev, 3\n",
87-
"The correct_span did not match lines, using corpus span instead at dev, 4\n",
88-
"Nothing to append here! Check dev, 21 again\n",
89-
"Nothing to append here! Check dev, 22 again\n",
90-
"The correct_span did not match lines, using corpus span instead at test, 14\n",
91-
"Nothing to append here! Check train, 142 again\n",
92-
"Nothing to append here! Check train, 143 again\n"
84+
"[WARNING] Could not find [76, 107): 'National Basketball Association': No span begins with 76\n"
9385
]
9486
},
9587
{
96-
"name": "stderr",
88+
"name": "stdout",
9789
"output_type": "stream",
9890
"text": [
99-
"[WARNING] Could not find [28, 31): 'AFL'\n",
100-
"[WARNING] Could not find [11, 21): 'AUSTRALIAN'\n"
91+
"Nothing to append here! Check train, 37 again\n",
92+
"Nothing to append here! Check train, 38 again\n",
93+
"Nothing to append here! Check train, 39 again\n",
94+
"Nothing to append here! Check train, 76 again\n",
95+
"Nothing to append here! Check train, 77 again\n",
96+
"Nothing to append here! Check train, 78 again\n",
97+
"Nothing to append here! Check train, 107 again\n",
98+
"Nothing to append here! Check train, 108 again\n",
99+
"Nothing to append here! Check train, 111 again\n"
101100
]
102101
}
103102
],
@@ -137,7 +136,7 @@
137136
},
138137
{
139138
"cell_type": "code",
140-
"execution_count": 6,
139+
"execution_count": 7,
141140
"metadata": {},
142141
"outputs": [],
143142
"source": [
@@ -148,7 +147,7 @@
148147
},
149148
{
150149
"cell_type": "code",
151-
"execution_count": 7,
150+
"execution_count": 8,
152151
"metadata": {},
153152
"outputs": [
154153
{
@@ -209,6 +208,9 @@
209208
" 188055,\n",
210209
" 187979,\n",
211210
" 187959,\n",
211+
" 179109,\n",
212+
" 179107,\n",
213+
" 179104,\n",
212214
" 173869,\n",
213215
" 161214,\n",
214216
" 161023,\n",
@@ -237,9 +239,14 @@
237239
" 76178,\n",
238240
" 74427,\n",
239241
" 73208,\n",
242+
" 73188,\n",
240243
" 71288,\n",
241244
" 70208,\n",
242245
" 70129,\n",
246+
" 70110,\n",
247+
" 70063,\n",
248+
" 70043,\n",
249+
" 70024,\n",
243250
" 69329,\n",
244251
" 69299,\n",
245252
" 69065,\n",
@@ -262,6 +269,7 @@
262269
" 12500,\n",
263270
" 12487,\n",
264271
" 10824,\n",
272+
" 9489,\n",
265273
" 9200,\n",
266274
" 8594,\n",
267275
" 8514,\n",
@@ -282,12 +290,12 @@
282290
},
283291
{
284292
"cell_type": "code",
285-
"execution_count": 16,
293+
"execution_count": 10,
286294
"metadata": {},
287295
"outputs": [],
288296
"source": [
289297
"import json\n",
290-
"json = json.dumps(lines_to_delete)\n",
298+
"json = json.dumps(lines_to_delete, indent=4, sort_keys=True)\n",
291299
"f = open(\"../corrected_labels/sentence_corrections.json\",\"w\")\n",
292300
"f.write(json)\n",
293301
"f.close()"
@@ -324,7 +332,7 @@
324332
"name": "python",
325333
"nbconvert_exporter": "python",
326334
"pygments_lexer": "ipython3",
327-
"version": "3.8.6"
335+
"version": "3.8.5"
328336
}
329337
},
330338
"nbformat": 4,

0 commit comments

Comments
 (0)