#ifndef __DataFormat_h_ #define __DataFormat_h_ #include #include #include #include #include "../shared/MiscSupport.h" #include "../shared/SmarterP.h" typedef uint16_t FieldId; /* This is only used internally. According to the internet, I should be able * to do a forward declaration, and put the body in the C file, but I couldn't * make that work. */ enum class StreamingDataType : char { UINT8_obsolete, // uint8_t INT16_obsolete, // int16_t INT32, // int32_t INT64, // int64_t FLOAT, // float DOUBLE, // double STRING1B, // variable length string, 0 - 255 bytes. STRING2B, // variable length string, 0 - 2^16-1 bytes DOUBLE0_obsolete, // Exactly 0.0. Store no additional data. DOUBLE3_obsolete, // Pack into 3 bytes. INT16_new, // int16_t, store directly in the header. DOUBLE_10000, // Store as int16_t and divide by 10000.0 DOUBLE_200, // Store as int16_t and divide by 200.0 }; /* We have a very strict model of the how these records work. One application * will create the record and stream it out to another application. That other * application will treat it as almost read only. In some cases the second * application will add/change the id number. This is different from, say, * ../shared/TwoDLookup.h, which has a very flexibly in-memory model. * RecordBuilder (and the corresponding Record reader class) are aimed at high * performance. They need to be fast and memory efficient. */ class RecordBuilder { public: typedef uint16_t Offset; // This structure is used to pack the data. It's also used by the Record // class to unpack the data. It has no value to any other classes. struct FixedInfo { FieldId fieldId; Offset offset; StreamingDataType type; bool operator <(FixedInfo const &other) const { return fieldId < other.fieldId; } bool operator ==(FixedInfo const &other) const { return fieldId == other.fieldId; } } __attribute__((packed)); // I put all of the fixed size information together to simplify things. In // particular, if you want to verify the header, you can just look at the // number of fields and the size of the entire packed structure. You don't // have to follow the offset pointer until you have found a specific record // that you want to follow. (An older version mixed the fieldId with the // variable sized information. This meant that you had to verify all offset // pointers in the header before trying to do a binary search.) // http://lemire.me/blog/archives/2012/05/31/data-alignment-for-speed-myth-or-reality/ private: // This lists some information about each field. This includes all fixed // size information. Because each record has a fixed size, we can easily // do a binary search on this vector. std::vector< FixedInfo > _fixedInfo; // This contains the information about each field that is not fixed size. // For example, the number 5 might fit into 1 byte, where the number // 0xf00000000 might require 8 bytes. std::string _variableInfo; // Automatically stores a pointer into the variable size area. void appendFieldHeader(FieldId id, StreamingDataType type); // Stores the value directly in the header. We use no space in the variable // size area. We use the pointer to the variable size area to store the // actual data. void appendFieldHeader(FieldId id, StreamingDataType type, int16_t value); public: void append(FieldId id, std::string const &value); void append(FieldId id, double value); void append(FieldId id, float value); void append(FieldId id, int16_t value); void append(FieldId id, int32_t value); void append(FieldId id, int64_t value); void append(FieldId id, int8_t value) { append(id, (int16_t)value); } void append(FieldId id, bool value) { append(id, (uint8_t)value); } void append(FieldId id, uint32_t value) { append(id, (int64_t)value); } void append(FieldId id, char const *value) { append(id, (std::string)value); } // See also ValueBox::appendTo(). // Force it to use 8 bytes. Reserve enough space that we could replace // the number with any 64 bit integer. void reserveInt(FieldId id, int64_t toAdd = 0); const bool empty() const { return _fixedInfo.empty(); } std::string exportAsString(); // Effectively the same as destination += exportAsString(), but more // efficient. void exportAndAppend(std::string &destination); }; /* These are used in memory, mostly in the ValueBox class. We have more options * on disk mostly to save space. In memory it's better to only use doubles * and 64 bit integers. However, we do have two formats for strings. * * "Internal" means "in memory" / "not packed into a record". "Internal" does * not mean "private". */ enum class InternalType : char { EMPTY, // NULL INTEGER, // int64_t DOUBLE, // double // This is a special purpose value aimed at the cache. We can store a // ValueBox with this set, and we know that the cache is empty. The // first time the cache tries to read this, it will compute the real // value and use that to replace this value. In previous revisions, // When we used a pointer to a value, we used NULL to serve this // purpose. So, when we're in this state, we don't expect a lot of // operations. Nothing more than you could do with a NULL pointer. CACHE_EMPTY, CHAR_STAR, /* A string of 0-14 characters, not counting the terminating * null. The string is stored directly in * ValueBox::_asCharStar. */ // Everything above this line does not use dynamic memory. Everything // below this line does require dynamic memory. STRING, /* A string of 15 or more characters, not counting the terminating * NULL. We dynamically allocated memory for this string and save * a pointer to the string in ValueBox::_value.asDynamicCharStar. * We used to use std::string in this case, instead of calling * new() and dispose() directly. That was helpful back when * std::string used copy-on-write, but it doesn't mean as much * any more. Also, at one time sizeof(std::string) was the same * as a pointer, so we could shove the string directly into * ValueBox::_value. If we tried that now we'd need to * dynamically allocate the std::string, which would in turn * usually allocate even more dynamic memory. */ // This Value holds an entire record. Records are not recursive. If we // see a Value that is an integer or a double, that might have been read // from a record. A value can also be an entire record, to help us with // caching. If you look for the 5 day average volume, you'll have to start // by looking up the alerts_daily record. Later, we might want to look up // the 10 day average volume. The cache manager doesn't know or care the // difference between a field and an entire record. RECORD }; class Record; typedef SmarterCP< Record > RecordRef; // I prefer Record::Ref to RecordRef. but the former was causing a circular // reference. You can't say "class Record::Ref". /* These are values in memory. They are often cached. They are often read * from a record. These are inexpensive to copy. Simple data, like an * integer, is stored directly. Big things, like a record, are stored in a * reference counted pointer. * * We do a limited amount of automatic conversion. Mostly that's promoting * an integer to a double. We don't try to convert a string to a double * because that's slow and we don't really need it. * * There is a specific value representing EMPTY / NULL. * * If you ask an object for its double value, and it contains a double, it just * copies from a field. If you ask an object for a double value, and it * contains an int, it automatically converts. If you ask object for a double * value, and the object is EMPTY, it will say that's not valid. The idea * is that almost any error is stored as an EMPTY value, and gets propagated * through most operations. * * Generally, any request that doesn't make sense will just set the valid flag * to false. We don't try to distinguish between types of errors. Arguably * you could treat 1+null differently from 1+{advol:10000, close_p:19.5}. The * former is missing data, which happens all the time and is to be expected. * The latter is a mistake in this program and could fail an assertion. * 1+"Block Print (NYSE)" could be a third case, an error in an upstream * program, which could write to the log file. For simplicity we report all * of these the same way. * * We create a lot of objects of this type. In particular, trying to evaluate * a where condition on an alert (or similar) will create a lot of these that * are all temporary. Most of those will be integers, doubles, or EMPTY. It * is very important for this type to be efficient. */ class ValueBox { private: InternalType _type; // Treat _asCharStar as an array of 15 bytes. We can use all the space // explicitly reserved by _asCharStar plus the space reserved by _value. // Previous revisions of the code only used the space in _value to store the // bytes. But the compiler automatically added padding to ValueBox so the // total size of ValueBox didn't change. We might as well use this space. // // Don't forget the terminating null. You can store a string with a length // of 14 or less because we always store a 0 at the end when we use // _asCharStar; char _asCharStar[7]; union { int64_t asInt; double asDouble; void *asPointer; char *asDynamicCharStar; } _value; // Notice the precondition. The internal type must be CHAR_STAR or STRING. // Use getString() if you aren't already certain of the type. This function // is for internal use so it's fast and not always safe. char const *asCharStar() const { return _type == InternalType::CHAR_STAR? _asCharStar:_value.asDynamicCharStar; } RecordRef const &asRecord() const { return *(RecordRef *)&_value.asPointer; } RecordRef &asRecord() { return *(RecordRef *)&_value.asPointer; } void releaseDynamicMemory(); // This will release any resources. After this call the exact state is // not defined. But the following are guaranteed: // a) This object holds no dynamic memory. It will be safe to exit a // destructor or to call assign() after calling release. // b) This object is in a valid state. It will be safe to return the object // to the user. void release() { // release() is pretty much the same as releaseDynamicMemory(). I broke // them into two functions for performance reasons. release() is inline // code, so the common case will be very fast. Releasing dynamic memory // is so much slower, there's no point in trying to inline all of // releaseDynamicMemory(). if (_type >= InternalType::STRING) releaseDynamicMemory(); } // The various forms of assign will NOT release any old resources. Call // these from the constructor or call them after calling release(). void assign(int64_t value) { _type = InternalType::INTEGER; _value.asInt = value; } void assign(double value) { _type = InternalType::DOUBLE; _value.asDouble = value; } void assign(std::string const &value); void assign(char const *value); void assign(char const *value, size_t length); void assign(RecordRef const &value); void assignDynamicMemory(ValueBox const &other); void assign(ValueBox const &other) { if (other._type >= InternalType::STRING) assignDynamicMemory(other); else // I'm pretty sure this is safe. I've never completely understood the // strict aliasing rules and I can't find anything about memcpy(). // This is as close as I could get: https://blog.regehr.org/archives/959 memcpy(this, &other, sizeof(ValueBox)); } // Note: && is not automatically inherited from the calling function. // Consider // void a(int &arg) { std::cout<<"&"; } // void a(int const &arg) { std::cout<<"const &"; } // void a(int &&arg) { std::cout<<"&&"; } // void b(int &arg) { a(arg); } // void b(int const &arg) { a(arg); } // void b(int &&arg) { a(arg); } // In some cases the compiler could be smart enough to call the third version // of a() on its own. But the third version of b() will call the first // version of a. void destructiveAssign(ValueBox &other) { memcpy(this, &other, sizeof(ValueBox)); other._type = InternalType::EMPTY; } public: InternalType getType() const { return _type; } void getInt(bool &valid, int64_t &value) const { if (_type == InternalType::INTEGER) { valid = true; value = _value.asInt; } else valid = false; } void getDouble(bool &valid, double &value) const { switch (_type) { case InternalType::INTEGER: valid = true; value = _value.asInt; break; case InternalType::DOUBLE: valid = true; value = _value.asDouble; break; default: valid = false; } } void getString(bool &valid, std::string &value) const; // Warning: This will return a pointer into this object. Don't call this // method on a temporary object. // // Correct: // const ValueBox RHS = _arg2->execute(recordInfo); // RHS.getString(valid, rightString); // Incorrect: // _arg2->execute(recordInfo).getString(valid, rightString); // // This, of course, only applies to the char * version of this function. // The std::string versions of getSrting() can use a temporary object with // no problems. And, of course, getInt(), getDouble, and getBoolean() don't // have any problems. // // See revision 1.41 of Strategy.C and revision 1.38 of Execution.C for // examples of the right way to do it. The previous revision of each // was done the wrong way. void getString(bool &valid, char const *&value) const; void getBoolean(bool &valid, bool &value) const { switch (_type) { case InternalType::INTEGER: valid = true; value = _value.asInt; break; case InternalType::DOUBLE: valid = true; value = _value.asDouble; break; default: valid = false; break; } } // This converts invalid to false. // Converting EMPTY/NULL to false is common in MySql, and thus in our older // code. // Converting strings to Booleans is different here from MySql. But that // shouldn't affect any existing code. We offer a constrained set of // operations to the user, and he just can't use a string as a Boolean. bool getBoolean() const { switch (_type) { case InternalType::INTEGER: return _value.asInt; case InternalType::DOUBLE: return _value.asDouble; default: return false; } } // These will return the value if it is valid. If it is not valid, they will // fail an assertion, throw and exception, or something. You should avoid // that case. It's very common to read the type first, and then have a // case statement. Once you've read the type, you should know if one of // these will work or not. If assertions are turned off and you request // an invalid value, there are no promises as to what you will get. int64_t getInt() const; double getDouble() const; bool isEmpty() const { return getType() == InternalType::EMPTY; } // When we are in the cache empty state, there isn't much you can do // with this object. In particular, getBoolean(), getInt(), getDouble(), // etc., are likely to fail an assertion or throw an exception. It is // reasonable to assign a different value, copy the current value, // call the destructor, or ask for the current type. This is a lot // like having a null pointer to an object. void setCacheEmpty() { release(); _type = InternalType::CACHE_EMPTY; } bool getCacheEmpty() const { return getType() == InternalType::CACHE_EMPTY; } // With any sort of error this will return the special EMPTY value. ValueBox lookUpValue(FieldId fieldId) const; // Same type and value. E.g the integer 4 is different from 4.0. This is // a shallow compare for records. Two values which point to the exact // same record will be the same. Note that one EMPTY value is equal // another. If you say == in a custom formula we will NOT use this function. // Instead, we will say 4 == 4.0 is true, and EMPTY == EMPTY is EMPTY. // // CHAR_STAR and STRING can be equal. That's an implementation detail. // (In an older implementation of this files some strings can be stored // either way. That's not true of the current implementation, but the // interface hasn't changed.) // // This function was called "equal" in previous revisions. bool operator ==(ValueBox const &other) const; bool operator !=(ValueBox const &other) const { return !(*this == other); } // This will format the value in a form appropriate to send to the client // software. // // We fill in a value in a PropertyList because this gives us a quick and // easy to way to say that we have no interesting data. In particular, // that's how we normally send a NULL to the user. We expect that this will // be used to fill in the properties of an XML node. // // If the name already exists in the list, the result is inconsistent. It's // guaranteed not to crash, or anything like that. But here's what's likely // to happen. If you overwrite a value with something real, like the number // 5, then the new value will replace the old value. If you try to write a // NULL, absolutely nothing happens. So if you try to overwrite a real value // with a NULL, the old value will remain. I don't think anyone will // purposely overwrite one value with another, and making the results more // consistent would be expensive for the common case, so that's why I left // the details undefined. void writeToClient(PropertyList &destination, std::string const &name) const; std::string shortDebug() const; ValueBox() : _type(InternalType::EMPTY) { } ValueBox(int64_t value) { assign(value); } ValueBox(double value) { if (std::isfinite(value)) assign(value); else _type = InternalType::EMPTY; } ValueBox(std::string const &value); ValueBox(char const *value); ValueBox(char const *value, int length) { assign(value, length); } ValueBox(RecordRef const &value); // int seems to have a stronger pull than other types. If I didn't have // this, ValueBox(4) would be ambiguous between double and int64_t. // This will not only attract ints, but also shorts, chars, bools, etc. ValueBox(int value) { assign((int64_t)value); } ValueBox(ValueBox const &other) { assign(other); } ValueBox(ValueBox &&other) { destructiveAssign(other); } ~ValueBox() { release(); } void operator =(ValueBox const &other) { release(); assign(other); } void operator =(ValueBox &&other) { release(); destructiveAssign(other); } // Like all append() operations, you should not repeat a fieldId. For // integers, doubles, and strings, this will extract the value from the // value box and add it to the record. NULLs are silently ignored, as they // should be, because NULLs are never explicitly stored in a record. Other // types (like a record or a cache miss) should not be sent here. That might // throw an exception, fail an assertion, or silently do nothing. Don't do // it! void appendTo(RecordBuilder &destination, FieldId fieldId) const; }; /* This is primarily aimed at reading a record created elsewhere. * * We have a limited ability to modify the record using this class. We can * update a value, but it must already exist and we must have reserved enough * space in advance. The idea is to use this for the id field in the alerts. */ // Proposed changes: A record with 0 fields should be different from an error. // an entry in the alerts or top list table will never have 0 fields, but we // might use these records in other places, like the result of a database // query. class Record : NoCopy, NoAssign { private: typedef RecordBuilder::Offset Offset; typedef RecordBuilder::FixedInfo FixedInfo; char *_encodedStart; size_t _encodedLength; bool _needToDeleteEncoded; // For simplicity we say that setting _fieldCount to 0 is a way of reporting // an error. This is nice because if you tried to look something up, you'd // automatically stop right away. You wouldn't have to look for an error // flag. And we expect a bad record and an empty record to act the same way: // no matter what you ask for, it doesn't exist. The only down side is that // it limits your error messages. You can't distinguish between a valid // record with no data and an invalid record. But I don't see any value to // an empty record -- you could never find it with an index -- so we'll just // say that's illegal. But see the proposed changes, above. Offset _fieldCount; bool success() const { return _fieldCount; } // fixedInfo()[0].offset + _variableInfoStart + _encodedStart is the start // of the variable part of the first field. _fixedInfo sorts fields by their // FieldId. FixedInfo const *fixedInfo() const { return (FixedInfo const *)(_encodedStart + sizeof(Offset)); } // The end of the fixed sized part of the fields. This is relative to // _encodedStart. int _variableInfoStart; // This is NULL if the field is not found. FixedInfo const *findField(FieldId id) const; friend class SmarterPBase< Record >; Record(char *encodedStart, size_t encodedLength, bool needToDeleteEncoded); public: // The idea is to create the object, possibly make a few small modifications, // then treat it as read only and pass it to multiple threads. typedef SmarterCP< Record > Ref; static SmarterP< Record > create(std::string const &encoded); static SmarterP< Record > createCopy(char const *start, size_t length); static Ref createShare(char const *start, size_t length); std::string getEncoded() const { return std::string(_encodedStart, _encodedLength); } char const *getEncodedStart() const { return _encodedStart; } size_t getEncodedLength() const { return _encodedLength; } ValueBox lookUpValue(FieldId fieldId) const; // Returns true on success, false on failure. bool update(FieldId fieldId, int64_t newValue); std::string debugDump() const; std::string shortDebug() const; std::vector< RecordBuilder::FixedInfo > debugGetFieldInfo() const; static std::string const &asString(StreamingDataType type); std::vector< FieldId > getFields() const; ~Record() { if (_needToDeleteEncoded) delete[](_encodedStart); } // A record with no fields. This might be convenient sometimes. Create will // return NULL rather than a record with no values, which is // indistinguishable from an error. An EMPTY value will have a similar // meaning, but sometimes you really want a record. static const Ref EMPTY; }; #endif