Skip to content

Commit 95b39b2

Browse files
committed
Improve .npy file writing support
* Overwrite, never append new .npy file to end of existing one * Pad header to properly align to 64 byte multiples * Force flush() to disk every recordBufferSize records * Update shape field in header on every flush() * Ensure shape field doesn't exceed header on update * Factor out shape string construction, fix empty array shape * General cleanup
1 parent e3c12aa commit 95b39b2

2 files changed

Lines changed: 97 additions & 45 deletions

File tree

Source/Plugins/BinaryWriter/NpyFile.cpp

Lines changed: 86 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ GNU General Public License for more details.
1919
You should have received a copy of the GNU General Public License
2020
along with this program. If not, see <http://www.gnu.org/licenses/>.
2121
22+
Specification of the .npy file format is at:
23+
24+
http://www.numpy.org/neps/nep-0001-npy-format.html
25+
26+
Python implementation is at:
27+
28+
https://github.com/numpy/numpy/blob/master/numpy/lib/format.py
29+
2230
*/
2331

2432
#include "NpyFile.h"
@@ -27,7 +35,6 @@ using namespace BinaryRecordingEngine;
2735

2836
NpyFile::NpyFile(String path, const Array<NpyType>& typeList)
2937
{
30-
3138
m_dim1 = 1;
3239
m_dim2 = 1;
3340

@@ -44,12 +51,10 @@ NpyFile::NpyFile(String path, const Array<NpyType>& typeList)
4451
if (!openFile(path))
4552
return;
4653
writeHeader(typeList);
47-
4854
}
4955

5056
NpyFile::NpyFile(String path, NpyType type, unsigned int dim)
5157
{
52-
5358
if (!openFile(path))
5459
return;
5560

@@ -58,7 +63,6 @@ NpyFile::NpyFile(String path, NpyType type, unsigned int dim)
5863
m_dim1 = dim;
5964
m_dim2 = type.getTypeLength();
6065
writeHeader(typeList);
61-
6266
}
6367

6468
bool NpyFile::openFile(String path)
@@ -67,9 +71,13 @@ bool NpyFile::openFile(String path)
6771
Result res = file.create();
6872
if (res.failed())
6973
{
70-
std::cerr << "Error creating file " << path << ":" << res.getErrorMessage() << std::endl;
74+
std::cerr << "Error creating file " << path << ":" << res.getErrorMessage()
75+
<< std::endl;
7176
return false;
7277
}
78+
file.deleteFile(); // overwrite, never append a new .npy file to end of an existing one
79+
// output stream buffer size defaults to 32768 bytes, but is irrelevant because
80+
// each updateHeader() call triggers a m_file->flush() to disk:
7381
m_file = file.createOutputStream();
7482
if (!m_file)
7583
return false;
@@ -78,69 +86,103 @@ bool NpyFile::openFile(String path)
7886
return true;
7987
}
8088

89+
// Builds the text of the header's 'shape' entry from the current record count
// and the two per-record dimensions. The returned text also carries the "), }"
// that closes both the shape tuple and the header dictionary.
// A dimension of 1 is left out, e.g.
//   0 records,   dim1 = 1, dim2 = 1  ->  "(0,), }"
//   128 records, dim1 = 4, dim2 = 2  ->  "(128, 4, 2), }"
String NpyFile::getShapeString()
{
    // the leading dimension is always present, even when no record was written yet
    String text = "(" + String(m_recordCount) + ",";
    text.preallocateBytes(32);

    const bool hasDim1 = m_dim1 > 1;
    const bool hasDim2 = m_dim2 > 1;

    if (hasDim1)
        text += " " + String(m_dim1) + ",";

    if (hasDim2)
        text += " " + String(m_dim2);

    text += "), }";
    return text;
}
104+
81105
void NpyFile::writeHeader(const Array<NpyType>& typeList)
82106
{
107+
uint8 magicNum = 0x93;
108+
String magicStr = "NUMPY";
109+
uint16 ver = 0x0001;
110+
// magic = magic number + magic string + magic version
111+
int magicLen = sizeof(uint8) + magicStr.getNumBytesAsUTF8() + sizeof(uint16);
112+
int nbytesAlign = 64; // header should use an integer multiple of this many bytes
113+
83114
bool multiValue = typeList.size() > 1;
84-
String header = "{'descr': ";
85-
header.preallocateBytes(100);
115+
String strHeader;
116+
strHeader.preallocateBytes(128);
117+
strHeader = "{'descr': ";
86118

87119
if (multiValue)
88-
header += "[";
120+
strHeader += "[";
89121

90122
int nTypes = typeList.size();
91123

92124
for (int i = 0; i < nTypes; i++)
93125
{
94126
NpyType& type = typeList.getReference(i);
95-
if (i > 0) header += ", ";
127+
if (i > 0) strHeader += ", ";
96128
if (multiValue)
97-
header += "('" + type.getName() + "', '" + type.getTypeString() + "', (" + String(type.getTypeLength()) + ",))";
129+
strHeader += "('" + type.getName() + "', '" + type.getTypeString()
130+
+ "', (" + String(type.getTypeLength()) + ",))";
98131
else
99-
header += "'" + type.getTypeString() + "'";
132+
strHeader += "'" + type.getTypeString() + "'";
100133
}
101134
if (multiValue)
102-
header += "]";
103-
header += ", 'fortran_order': False, 'shape': ";
104-
105-
m_countPos = header.length() + 10;
106-
header += "(1,), }";
107-
int padding = (int((header.length() + 30) / 16) + 1) * 16;
108-
header = header.paddedRight(' ', padding);
109-
header += '\n';
135+
strHeader += "]";
136+
strHeader += ", 'fortran_order': False, 'shape': ";
137+
138+
// save byte offset of shape field in .npy file
139+
// magic + header length field + current string header length:
140+
m_shapePos = magicLen + sizeof(uint16) + strHeader.length();
141+
strHeader += getShapeString(); // inits to 0 records, i.e. 1st dim has length 0
142+
int baseHeaderLen = magicLen + sizeof(uint16) + strHeader.length() + 1; // +1 for newline
143+
int padlen = nbytesAlign - (baseHeaderLen % nbytesAlign);
144+
strHeader = strHeader.paddedRight(' ', strHeader.length() + padlen);
145+
strHeader += '\n';
146+
uint16 strHeaderLen = strHeader.length();
110147

111-
uint8 magicNum = 0x093;
112148
m_file->write(&magicNum, sizeof(uint8));
113-
String magic = "NUMPY";
114-
uint16 len = header.length();
115-
m_file->write(magic.toUTF8(), magic.getNumBytesAsUTF8());
116-
uint16 ver = 0x0001;
149+
m_file->write(magicStr.toUTF8(), magicStr.getNumBytesAsUTF8());
117150
m_file->write(&ver, sizeof(uint16));
118-
m_file->write(&len, sizeof(uint16));
119-
m_file->write(header.toUTF8(), len);
151+
m_file->write(&strHeaderLen, sizeof(uint16));
152+
m_file->write(strHeader.toUTF8(), strHeaderLen);
153+
m_headerLen = m_file->getPosition(); // total header length
154+
m_file->flush();
120155
}
121156

122-
NpyFile::~NpyFile()
157+
void NpyFile::updateHeader()
123158
{
124-
if (m_file->setPosition(m_countPos))
159+
// overwrite the shape part of the header - even without explicitly calling
160+
// m_file->flush(), overwriting seems to trigger a flush to disk,
161+
// while appending to end of file does not
162+
int currentPos = m_file->getPosition();
163+
if (m_file->setPosition(m_shapePos))
125164
{
126-
String newShape = "(";
127-
newShape.preallocateBytes(20);
128-
newShape += String(m_recordCount) + ",";
129-
if (m_dim1 > 1)
165+
String newShape = getShapeString();
166+
if (m_shapePos + newShape.getNumBytesAsUTF8() + 1 > m_headerLen) // +1 for newline
130167
{
131-
newShape += String(m_dim1) + ",";
168+
std::cerr << "Error. Header has grown too big to update in-place " << std::endl;
132169
}
133-
if (m_dim2 > 1)
134-
newShape += String(m_dim2);
135-
newShape += "), }";
136170
m_file->write(newShape.toUTF8(), newShape.getNumBytesAsUTF8());
171+
//m_file->flush(); // doesn't seem to be necessary, already flushed due to overwrite
172+
m_file->setPosition(currentPos); // restore position to end of file
137173
}
138174
else
139175
{
140-
std::cerr << "Error. Unable to seek to update header on file " << m_file->getFile().getFullPathName() << std::endl;
176+
std::cerr << "Error. Unable to seek to update file header"
177+
<< m_file->getFile().getFullPathName() << std::endl;
141178
}
142179
}
143180

181+
// Brings the header's shape field up to date with the final record count
// before the object goes away.
NpyFile::~NpyFile()
{
    // The constructors return early when openFile() fails, which leaves m_file
    // empty; updateHeader() dereferences it, so there is nothing to do then.
    if (!m_file)
        return;

    updateHeader();
}
185+
144186
void NpyFile::writeData(const void* data, size_t size)
145187
{
146188
m_file->write(data, size);
@@ -149,9 +191,10 @@ void NpyFile::writeData(const void* data, size_t size)
149191
void NpyFile::increaseRecordCount(int count)
150192
{
151193
m_recordCount += count;
194+
if (m_recordCount % recordBufferSize == 0)
195+
updateHeader(); // also triggers a flush to disk
152196
}
153197

154-
155198
NpyType::NpyType(String n, BaseType t, size_t l)
156199
: name(n), type(t), length(l)
157200
{
@@ -173,11 +216,11 @@ String NpyType::getTypeString() const
173216
switch (type)
174217
{
175218
case BaseType::CHAR:
176-
return "S" + String(length + 1); //account for the null separator
219+
return "|S" + String(length + 1); // null-terminated bytes, account for null separator
177220
case BaseType::INT8:
178-
return "<i1";
221+
return "|i1";
179222
case BaseType::UINT8:
180-
return "<u1";
223+
return "|u1";
181224
case BaseType::INT16:
182225
return "<i2";
183226
case BaseType::UINT16:
@@ -195,7 +238,7 @@ String NpyType::getTypeString() const
195238
case BaseType::DOUBLE:
196239
return "<f8";
197240
default:
198-
return "<b1";
241+
return "|i1"; // signed byte
199242
}
200243
}
201244

@@ -215,4 +258,4 @@ String NpyType::getName() const
215258
// Returns the BaseType value held in this object's `type` member.
BaseType NpyType::getType() const
216259
{
217260
return type;
218-
}
261+
}

Source/Plugins/BinaryWriter/NpyFile.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,23 @@ namespace BinaryRecordingEngine
5555
void increaseRecordCount(int count = 1);
5656
private:
5757
bool openFile(String path);
58+
String getShapeString();
5859
void writeHeader(const Array<NpyType>& typeList);
60+
void updateHeader();
5961
ScopedPointer<FileOutputStream> m_file;
62+
int64 m_headerLen; // total header length
6063
bool m_okOpen{ false };
6164
int64 m_recordCount{ 0 };
62-
size_t m_countPos;
65+
size_t m_shapePos;
6366
unsigned int m_dim1;
6467
unsigned int m_dim2;
68+
69+
// Compile-time constants
70+
71+
// flush file buffer to disk and update the .npy header every this many records:
72+
const int recordBufferSize{ 128 };
73+
6574
};
6675

6776
};
68-
#endif
77+
#endif

0 commit comments

Comments
 (0)