diff --git a/scripts/docker-integration-tests/simple/test.sh b/scripts/docker-integration-tests/simple/test.sh index d85be4fa92..3cedecc279 100755 --- a/scripts/docker-integration-tests/simple/test.sh +++ b/scripts/docker-integration-tests/simple/test.sh @@ -142,25 +142,25 @@ curl -vvvsS -X POST 0.0.0.0:9003/writetagged -d '{ } }' +function test_query_result { + queryResult=$(curl -sSf -X POST 0.0.0.0:9003/query -d '{ + "namespace": "unagg", + "query": { + "regexp": { + "field": "city", + "regexp": ".*" + } + }, + "rangeStart": 0, + "rangeEnd":'"$(date +"%s")"' + }' | jq '.results | length') + test "$queryResult" -eq "1" + return $? +} + echo "Read data" -queryResult=$(curl -sSf -X POST 0.0.0.0:9003/query -d '{ - "namespace": "unagg", - "query": { - "regexp": { - "field": "city", - "regexp": ".*" - } - }, - "rangeStart": 0, - "rangeEnd":'"$(date +"%s")"' -}' | jq '.results | length') - -if [ "$queryResult" -lt 1 ]; then - echo "Result not found" - exit 1 -else - echo "Result found" -fi +ATTEMPTS=10 MAX_TIMEOUT=1 TIMEOUT=.1 retry_with_backoff \ + test_query_result echo "Deleting placement" curl -vvvsSf -X DELETE 0.0.0.0:7201/api/v1/services/m3db/placement diff --git a/src/cmd/tools/read_index_segments/main/main.go b/src/cmd/tools/read_index_segments/main/main.go index 7a927d9c9d..0290faa73a 100644 --- a/src/cmd/tools/read_index_segments/main/main.go +++ b/src/cmd/tools/read_index_segments/main/main.go @@ -154,7 +154,7 @@ func readNamespaceSegments( ) { var ( infoFiles = fs.ReadIndexInfoFiles(fsOpts.FilePathPrefix(), nsID, - fsOpts.InfoReaderBufferSize()) + fsOpts.InfoReaderBufferSize(), persist.FileSetFlushType) wg sync.WaitGroup ) diff --git a/src/dbnode/digest/buffer.go b/src/dbnode/digest/buffer.go index a5cbeb8c77..3aea0c8923 100644 --- a/src/dbnode/digest/buffer.go +++ b/src/dbnode/digest/buffer.go @@ -22,6 +22,7 @@ package digest import ( "encoding/binary" + "errors" "os" ) @@ -33,6 +34,9 @@ const ( var ( // Endianness is little endian endianness = 
binary.LittleEndian + // errBufferSizeTooSmall is for when the buffer passed in to be + // converted to a digest buffer is of insufficient size. + errBufferSizeTooSmall = errors.New("buffer size too small") ) // Buffer is a byte slice that facilitates digest reading and writing. @@ -70,6 +74,9 @@ func (b Buffer) ReadDigestFromFile(fd *os.File) (uint32, error) { } // ToBuffer converts a byte slice to a digest buffer. -func ToBuffer(buf []byte) Buffer { - return Buffer(buf[:DigestLenBytes]) +func ToBuffer(buf []byte) (Buffer, error) { + if len(buf) < DigestLenBytes { + return nil, errBufferSizeTooSmall + } + return Buffer(buf[:DigestLenBytes]), nil } diff --git a/src/dbnode/digest/buffer_test.go b/src/dbnode/digest/buffer_test.go index 788256eff3..ac9f10207b 100644 --- a/src/dbnode/digest/buffer_test.go +++ b/src/dbnode/digest/buffer_test.go @@ -52,10 +52,16 @@ func TestWriteDigestToFile(t *testing.T) { } func TestReadDigest(t *testing.T) { - buf := ToBuffer([]byte{0x0, 0x1, 0x0, 0x1, 0x0, 0x1}) + buf, err := ToBuffer([]byte{0x0, 0x1, 0x0, 0x1, 0x0, 0x1}) + require.NoError(t, err) require.Equal(t, uint32(0x1000100), buf.ReadDigest()) } +func TestReadDigestInsufficientBytes(t *testing.T) { + _, err := ToBuffer([]byte{0x0}) + require.Equal(t, err, errBufferSizeTooSmall) +} + func TestReadDigestFromFile(t *testing.T) { fd := createTempFile(t) defer func() { diff --git a/src/dbnode/generated/proto/index/index.pb.go b/src/dbnode/generated/proto/index/index.pb.go index b81ca162ec..1eefff3c51 100644 --- a/src/dbnode/generated/proto/index/index.pb.go +++ b/src/dbnode/generated/proto/index/index.pb.go @@ -55,6 +55,27 @@ var _ = math.Inf // proto package needs to be updated. 
const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package +type SegmentState int32 + +const ( + SegmentState_COMPACTABLE_SEGMENT_STATE SegmentState = 0 + SegmentState_FROZEN_SEGMENT_STATE SegmentState = 1 +) + +var SegmentState_name = map[int32]string{ + 0: "COMPACTABLE_SEGMENT_STATE", + 1: "FROZEN_SEGMENT_STATE", +} +var SegmentState_value = map[string]int32{ + "COMPACTABLE_SEGMENT_STATE": 0, + "FROZEN_SEGMENT_STATE": 1, +} + +func (x SegmentState) String() string { + return proto.EnumName(SegmentState_name, int32(x)) +} +func (SegmentState) EnumDescriptor() ([]byte, []int) { return fileDescriptorIndex, []int{0} } + type IndexVolumeInfo struct { MajorVersion int64 `protobuf:"varint,1,opt,name=majorVersion,proto3" json:"majorVersion,omitempty"` BlockStart int64 `protobuf:"varint,2,opt,name=blockStart,proto3" json:"blockStart,omitempty"` @@ -64,6 +85,7 @@ type IndexVolumeInfo struct { SnapshotTime int64 `protobuf:"varint,6,opt,name=snapshotTime,proto3" json:"snapshotTime,omitempty"` Segments []*SegmentInfo `protobuf:"bytes,7,rep,name=segments" json:"segments,omitempty"` IndexVolumeType *google_protobuf.StringValue `protobuf:"bytes,8,opt,name=indexVolumeType" json:"indexVolumeType,omitempty"` + SnapshotID []byte `protobuf:"bytes,9,opt,name=snapshotID,proto3" json:"snapshotID,omitempty"` } func (m *IndexVolumeInfo) Reset() { *m = IndexVolumeInfo{} } @@ -127,12 +149,20 @@ func (m *IndexVolumeInfo) GetIndexVolumeType() *google_protobuf.StringValue { return nil } +func (m *IndexVolumeInfo) GetSnapshotID() []byte { + if m != nil { + return m.SnapshotID + } + return nil +} + type SegmentInfo struct { SegmentType string `protobuf:"bytes,1,opt,name=segmentType,proto3" json:"segmentType,omitempty"` MajorVersion int64 `protobuf:"varint,2,opt,name=majorVersion,proto3" json:"majorVersion,omitempty"` MinorVersion int64 `protobuf:"varint,3,opt,name=minorVersion,proto3" json:"minorVersion,omitempty"` Metadata []byte `protobuf:"bytes,4,opt,name=metadata,proto3" 
json:"metadata,omitempty"` Files []*SegmentFileInfo `protobuf:"bytes,5,rep,name=files" json:"files,omitempty"` + SegmentState SegmentState `protobuf:"varint,6,opt,name=segmentState,proto3,enum=index.SegmentState" json:"segmentState,omitempty"` } func (m *SegmentInfo) Reset() { *m = SegmentInfo{} } @@ -175,6 +205,13 @@ func (m *SegmentInfo) GetFiles() []*SegmentFileInfo { return nil } +func (m *SegmentInfo) GetSegmentState() SegmentState { + if m != nil { + return m.SegmentState + } + return SegmentState_COMPACTABLE_SEGMENT_STATE +} + type SegmentFileInfo struct { SegmentFileType string `protobuf:"bytes,1,opt,name=segmentFileType,proto3" json:"segmentFileType,omitempty"` } @@ -270,6 +307,7 @@ func init() { proto.RegisterType((*IndexDigests)(nil), "index.IndexDigests") proto.RegisterType((*SegmentDigest)(nil), "index.SegmentDigest") proto.RegisterType((*SegmentFileDigest)(nil), "index.SegmentFileDigest") + proto.RegisterEnum("index.SegmentState", SegmentState_name, SegmentState_value) } func (m *IndexVolumeInfo) Marshal() (dAtA []byte, err error) { size := m.Size() @@ -350,6 +388,12 @@ func (m *IndexVolumeInfo) MarshalTo(dAtA []byte) (int, error) { } i += n3 } + if len(m.SnapshotID) > 0 { + dAtA[i] = 0x4a + i++ + i = encodeVarintIndex(dAtA, i, uint64(len(m.SnapshotID))) + i += copy(dAtA[i:], m.SnapshotID) + } return i, nil } @@ -402,6 +446,11 @@ func (m *SegmentInfo) MarshalTo(dAtA []byte) (int, error) { i += n } } + if m.SegmentState != 0 { + dAtA[i] = 0x30 + i++ + i = encodeVarintIndex(dAtA, i, uint64(m.SegmentState)) + } return i, nil } @@ -573,6 +622,10 @@ func (m *IndexVolumeInfo) Size() (n int) { l = m.IndexVolumeType.Size() n += 1 + l + sovIndex(uint64(l)) } + l = len(m.SnapshotID) + if l > 0 { + n += 1 + l + sovIndex(uint64(l)) + } return n } @@ -599,6 +652,9 @@ func (m *SegmentInfo) Size() (n int) { n += 1 + l + sovIndex(uint64(l)) } } + if m.SegmentState != 0 { + n += 1 + sovIndex(uint64(m.SegmentState)) + } return n } @@ -919,6 +975,37 @@ func (m 
*IndexVolumeInfo) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 9: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field SnapshotID", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowIndex + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthIndex + } + postIndex := iNdEx + byteLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.SnapshotID = append(m.SnapshotID[:0], dAtA[iNdEx:postIndex]...) + if m.SnapshotID == nil { + m.SnapshotID = []byte{} + } + iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipIndex(dAtA[iNdEx:]) @@ -1098,6 +1185,25 @@ func (m *SegmentInfo) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 6: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field SegmentState", wireType) + } + m.SegmentState = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowIndex + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.SegmentState |= (SegmentState(b) & 0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipIndex(dAtA[iNdEx:]) @@ -1616,36 +1722,41 @@ func init() { } var fileDescriptorIndex = []byte{ - // 484 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x92, 0xcf, 0x8a, 0xdb, 0x30, - 0x10, 0xc6, 0x6b, 0xbb, 0x49, 0xb3, 0xe3, 0xa4, 0x69, 0x45, 0x59, 0xcc, 0xb2, 0x18, 0xe3, 0x93, - 0x0f, 0xc5, 0x86, 0xe4, 0xd8, 0x42, 0xa1, 0x94, 0x85, 0xbd, 0x3a, 0xdb, 0xdc, 0xe5, 0x58, 0x71, - 0xd4, 0xda, 0x92, 0x91, 0x14, 0xfa, 0xe7, 0x29, 0xfa, 0x4a, 0xa5, 0x97, 0x1e, 0xfb, 0x08, 0x25, - 0x7d, 0x91, 0x22, 0xc9, 0x9b, 0x38, 0xc9, 0x1e, 0xf6, 0x12, 0xf8, 0xbe, 0xf9, 0xe4, 0x99, 0xf9, - 0x65, 0xe0, 
0x5d, 0x45, 0xd5, 0x66, 0x5b, 0xa4, 0x2b, 0xde, 0x64, 0xcd, 0xbc, 0x2c, 0xb2, 0x66, - 0x9e, 0x49, 0xb1, 0xca, 0xca, 0x82, 0xf1, 0x92, 0x64, 0x15, 0x61, 0x44, 0x60, 0x45, 0xca, 0xac, - 0x15, 0x5c, 0xf1, 0x8c, 0xb2, 0x92, 0x7c, 0xb5, 0xbf, 0xa9, 0x71, 0xd0, 0xc0, 0x88, 0xab, 0xb0, - 0xe2, 0xbc, 0xaa, 0x89, 0x8d, 0x15, 0xdb, 0x75, 0xf6, 0x45, 0xe0, 0xb6, 0x25, 0x42, 0xda, 0x58, - 0xfc, 0xcb, 0x85, 0xe9, 0xad, 0x4e, 0x2e, 0x79, 0xbd, 0x6d, 0xc8, 0x2d, 0x5b, 0x73, 0x14, 0xc3, - 0xb8, 0xc1, 0x9f, 0xb8, 0x58, 0x12, 0x21, 0x29, 0x67, 0x81, 0x13, 0x39, 0x89, 0x97, 0x1f, 0x79, - 0x28, 0x04, 0x28, 0x6a, 0xbe, 0xfa, 0xbc, 0x50, 0x58, 0xa8, 0xc0, 0x35, 0x89, 0x9e, 0x83, 0xae, - 0xe1, 0xc2, 0x2a, 0xfa, 0x9d, 0x04, 0x9e, 0x29, 0x1f, 0x0c, 0x74, 0x05, 0xa3, 0x35, 0xad, 0xc9, - 0xdd, 0xb7, 0x96, 0x04, 0x4f, 0x4d, 0x71, 0xaf, 0xd1, 0x25, 0x0c, 0xe5, 0x06, 0x8b, 0x52, 0x06, - 0x83, 0xc8, 0x4b, 0x26, 0x79, 0xa7, 0xf4, 0x54, 0x92, 0xe1, 0x56, 0x6e, 0xb8, 0xba, 0xa3, 0x0d, - 0x09, 0x86, 0x76, 0xaa, 0xbe, 0x87, 0x52, 0x18, 0x49, 0x52, 0x35, 0x84, 0x29, 0x19, 0x3c, 0x8b, - 0xbc, 0xc4, 0x9f, 0xa1, 0xd4, 0x42, 0x59, 0x58, 0x5b, 0xef, 0x97, 0xef, 0x33, 0xe8, 0x06, 0xa6, - 0xf4, 0xb0, 0xbc, 0x19, 0x67, 0x14, 0x39, 0x89, 0x3f, 0xbb, 0x4e, 0x2d, 0xb7, 0xf4, 0x9e, 0x5b, - 0xba, 0x50, 0x82, 0xb2, 0x6a, 0x89, 0xeb, 0x2d, 0xc9, 0x4f, 0x1f, 0xc5, 0x3f, 0x1d, 0xf0, 0x7b, - 0x1d, 0x50, 0x04, 0x7e, 0xd7, 0xc3, 0x7c, 0x53, 0x03, 0xbc, 0xc8, 0xfb, 0xd6, 0x19, 0x63, 0xf7, - 0x01, 0xc6, 0x3a, 0x43, 0xd9, 0x21, 0xe3, 0x75, 0x99, 0x9e, 0xa7, 0x49, 0x36, 0x44, 0xe1, 0x12, - 0x2b, 0x6c, 0x48, 0x8e, 0xf3, 0xbd, 0x46, 0xaf, 0x61, 0xa0, 0xa9, 0x5a, 0x90, 0xfe, 0xec, 0xf2, - 0x18, 0xc5, 0x0d, 0xad, 0xcd, 0xdf, 0x9d, 0xdb, 0x50, 0xfc, 0x06, 0xa6, 0x27, 0x15, 0x94, 0xc0, - 0x54, 0x1e, 0xac, 0xde, 0x2a, 0xa7, 0x76, 0x5c, 0xc3, 0xd8, 0x5c, 0xd1, 0x07, 0x5a, 0x11, 0xa9, - 0xa4, 0x3e, 0x0f, 0xca, 0xd6, 0xdc, 0x4a, 0xf3, 0x68, 0x92, 0xf7, 0x1c, 0xf4, 0x16, 0x9e, 0x77, - 0x9f, 0xe8, 0x5e, 0x04, 0xae, 0x99, 0xf1, 0xd5, 
0xf1, 0x8c, 0xb6, 0x98, 0x9f, 0x64, 0x63, 0x0c, - 0x93, 0xa3, 0xc0, 0x23, 0x78, 0xa7, 0xf7, 0x2c, 0x6c, 0x9f, 0xe0, 0x9c, 0x45, 0xd7, 0xab, 0xa3, - 0xf1, 0x11, 0x5e, 0x9e, 0xd5, 0x1e, 0xcf, 0x43, 0x1f, 0x71, 0x69, 0x77, 0x77, 0xcd, 0xee, 0x9d, - 0x7a, 0xff, 0xe2, 0xf7, 0x2e, 0x74, 0xfe, 0xec, 0x42, 0xe7, 0xef, 0x2e, 0x74, 0x7e, 0xfc, 0x0b, - 0x9f, 0x14, 0x43, 0x73, 0x61, 0xf3, 0xff, 0x01, 0x00, 0x00, 0xff, 0xff, 0x48, 0x32, 0xee, 0x47, - 0xf1, 0x03, 0x00, 0x00, + // 574 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x93, 0xcf, 0x6e, 0x9b, 0x4c, + 0x14, 0xc5, 0x83, 0xfd, 0x25, 0x5f, 0x32, 0x26, 0x71, 0x3a, 0x8d, 0x22, 0x1a, 0xa5, 0x08, 0xb1, + 0x42, 0x55, 0x05, 0x92, 0xbd, 0xe8, 0xa2, 0x95, 0x2a, 0x27, 0xb1, 0x23, 0x4b, 0x4d, 0x52, 0x0d, + 0xd4, 0x8b, 0x6e, 0xa2, 0xc1, 0x8c, 0xf1, 0xb4, 0xc0, 0x58, 0xcc, 0x58, 0xfd, 0xf3, 0x14, 0x7d, + 0xac, 0x2e, 0xfb, 0x08, 0x95, 0xf3, 0x16, 0x5d, 0x55, 0xcc, 0x60, 0x1b, 0xec, 0x2e, 0xb2, 0x41, + 0xba, 0xe7, 0x9e, 0x61, 0xce, 0xfd, 0x5d, 0x00, 0x6f, 0x63, 0x2a, 0xa6, 0xf3, 0xd0, 0x1d, 0xb3, + 0xd4, 0x4b, 0xbb, 0x51, 0xe8, 0xa5, 0x5d, 0x8f, 0xe7, 0x63, 0x2f, 0x0a, 0x33, 0x16, 0x11, 0x2f, + 0x26, 0x19, 0xc9, 0xb1, 0x20, 0x91, 0x37, 0xcb, 0x99, 0x60, 0x1e, 0xcd, 0x22, 0xf2, 0x55, 0x3d, + 0x5d, 0xa9, 0xc0, 0x5d, 0x59, 0x9c, 0x99, 0x31, 0x63, 0x71, 0x42, 0x94, 0x2d, 0x9c, 0x4f, 0xbc, + 0x2f, 0x39, 0x9e, 0xcd, 0x48, 0xce, 0x95, 0xcd, 0x7e, 0x68, 0x80, 0xf6, 0xb0, 0x70, 0x8e, 0x58, + 0x32, 0x4f, 0xc9, 0x30, 0x9b, 0x30, 0x68, 0x03, 0x3d, 0xc5, 0x9f, 0x58, 0x3e, 0x22, 0x39, 0xa7, + 0x2c, 0x33, 0x34, 0x4b, 0x73, 0x9a, 0xa8, 0xa6, 0x41, 0x13, 0x80, 0x30, 0x61, 0xe3, 0xcf, 0xbe, + 0xc0, 0xb9, 0x30, 0x1a, 0xd2, 0x51, 0x51, 0xe0, 0x39, 0x38, 0x50, 0x15, 0xfd, 0x4e, 0x8c, 0xa6, + 0x6c, 0xaf, 0x05, 0x78, 0x06, 0xf6, 0x27, 0x34, 0x21, 0xc1, 0xb7, 0x19, 0x31, 0xfe, 0x93, 0xcd, + 0x55, 0x0d, 0x4f, 0xc1, 0x1e, 0x9f, 0xe2, 0x3c, 0xe2, 0xc6, 0xae, 0xd5, 0x74, 0x0e, 0x51, 0x59, + 0x15, 0xa9, 
0x78, 0x86, 0x67, 0x7c, 0xca, 0x44, 0x40, 0x53, 0x62, 0xec, 0xa9, 0x54, 0x55, 0x0d, + 0xba, 0x60, 0x9f, 0x93, 0x38, 0x25, 0x99, 0xe0, 0xc6, 0xff, 0x56, 0xd3, 0x69, 0x75, 0xa0, 0xab, + 0xa0, 0xf8, 0x4a, 0x2e, 0xe6, 0x43, 0x2b, 0x0f, 0x1c, 0x80, 0x36, 0x5d, 0x0f, 0x2f, 0xe3, 0xec, + 0x5b, 0x9a, 0xd3, 0xea, 0x9c, 0xbb, 0x8a, 0x9b, 0xbb, 0xe4, 0xe6, 0xfa, 0x22, 0xa7, 0x59, 0x3c, + 0xc2, 0xc9, 0x9c, 0xa0, 0xcd, 0x43, 0x05, 0x8d, 0x65, 0x8e, 0xe1, 0x95, 0x71, 0x60, 0x69, 0x8e, + 0x8e, 0x2a, 0x8a, 0xfd, 0x47, 0x03, 0xad, 0x4a, 0x02, 0x68, 0x81, 0x56, 0x99, 0x41, 0xde, 0x59, + 0x00, 0x3e, 0x40, 0x55, 0x69, 0x6b, 0x07, 0x8d, 0x7f, 0xec, 0xa0, 0xf0, 0xd0, 0x6c, 0xed, 0x69, + 0x96, 0x9e, 0x8a, 0x56, 0x90, 0x4e, 0x89, 0xc0, 0x11, 0x16, 0x58, 0x92, 0xd6, 0xd1, 0xaa, 0x86, + 0x2f, 0xc1, 0x6e, 0x41, 0x5d, 0x81, 0x6e, 0x75, 0x4e, 0xeb, 0xa8, 0x06, 0x34, 0x91, 0x9f, 0x03, + 0x52, 0x26, 0xf8, 0x0a, 0xe8, 0x65, 0x40, 0x5f, 0x60, 0xa1, 0xf8, 0x1f, 0x75, 0x9e, 0xd6, 0x0f, + 0xc9, 0x16, 0xaa, 0x19, 0xed, 0xd7, 0xa0, 0xbd, 0xf1, 0x4a, 0xe8, 0x80, 0x36, 0x5f, 0x4b, 0x15, + 0x06, 0x9b, 0xb2, 0x9d, 0x00, 0x5d, 0x7e, 0x9e, 0x57, 0x34, 0x26, 0x5c, 0xf0, 0x82, 0x34, 0xcd, + 0x26, 0x4c, 0x95, 0xf2, 0xd0, 0x21, 0xaa, 0x28, 0xf0, 0x0d, 0x38, 0x2a, 0x5f, 0x51, 0x9e, 0x30, + 0x1a, 0x72, 0xb8, 0x93, 0x7a, 0x4e, 0xd5, 0x44, 0x1b, 0x5e, 0x1b, 0x83, 0xc3, 0x9a, 0xe1, 0x11, + 0x8b, 0x72, 0x97, 0x10, 0xd5, 0x3d, 0xc6, 0x36, 0xc4, 0xf2, 0x2e, 0x65, 0xb3, 0x3f, 0x80, 0x27, + 0x5b, 0xbd, 0xc7, 0xf3, 0x28, 0xfe, 0x8e, 0x48, 0xcd, 0xde, 0x90, 0xb3, 0x97, 0xd5, 0x8b, 0x6b, + 0xa0, 0x57, 0x57, 0x00, 0x9f, 0x83, 0x67, 0x97, 0x77, 0x37, 0xef, 0x7b, 0x97, 0x41, 0xef, 0xe2, + 0x5d, 0xff, 0xde, 0xef, 0x5f, 0xdf, 0xf4, 0x6f, 0x83, 0x7b, 0x3f, 0xe8, 0x05, 0xfd, 0xe3, 0x1d, + 0x68, 0x80, 0x93, 0x01, 0xba, 0xfb, 0xd8, 0xbf, 0xdd, 0xe8, 0x68, 0x17, 0xc7, 0x3f, 0x17, 0xa6, + 0xf6, 0x6b, 0x61, 0x6a, 0xbf, 0x17, 0xa6, 0xf6, 0xe3, 0xc1, 0xdc, 0x09, 0xf7, 0xe4, 0x3f, 0xd0, + 0xfd, 0x1b, 0x00, 0x00, 0xff, 0xff, 0x81, 0x3b, 
0x84, 0x1a, 0x93, 0x04, 0x00, 0x00, } diff --git a/src/dbnode/generated/proto/index/index.proto b/src/dbnode/generated/proto/index/index.proto index c1a0c9a4bc..e2877e658f 100644 --- a/src/dbnode/generated/proto/index/index.proto +++ b/src/dbnode/generated/proto/index/index.proto @@ -13,6 +13,12 @@ message IndexVolumeInfo { int64 snapshotTime = 6; repeated SegmentInfo segments = 7; google.protobuf.StringValue indexVolumeType = 8; + bytes snapshotID = 9; +} + +enum SegmentState { + COMPACTABLE_SEGMENT_STATE = 0; + FROZEN_SEGMENT_STATE = 1; } message SegmentInfo { @@ -21,6 +27,7 @@ message SegmentInfo { int64 minorVersion = 3; bytes metadata = 4; repeated SegmentFileInfo files = 5; + SegmentState segmentState = 6; } message SegmentFileInfo { diff --git a/src/dbnode/integration/commitlog_bootstrap_helpers.go b/src/dbnode/integration/commitlog_bootstrap_helpers.go index e9456ff1e2..1359b3f23f 100644 --- a/src/dbnode/integration/commitlog_bootstrap_helpers.go +++ b/src/dbnode/integration/commitlog_bootstrap_helpers.go @@ -156,7 +156,7 @@ func writeCommitLogDataBase( shardSet = s.ShardSet() tagEncoderPool = opts.FilesystemOptions().TagEncoderPool() tagSliceIter = ident.NewTagsIterator(ident.Tags{}) - writes int + writes int ) // Write out commit log data. 
@@ -218,11 +218,9 @@ func writeCommitLogDataBase( func writeSnapshotsWithPredicate( t *testing.T, s TestSetup, - opts commitlog.Options, data generate.SeriesBlocksByStart, volume int, namespace namespace.Metadata, - specifiedTS *time.Time, pred generate.WriteDatapointPredicate, snapshotInterval time.Duration, ) { @@ -231,3 +229,17 @@ func writeSnapshotsWithPredicate( namespace, s, data, volume, pred, snapshotInterval) require.NoError(t, err) } + +func writeIndexSnapshotsWithPredicate( + t *testing.T, + s TestSetup, + data generate.SeriesBlocksByStart, + namespace namespace.Metadata, + pred generate.WriteDatapointPredicate, + snapshotInterval time.Duration, +) { + // Write out snapshots + err := writeTestIndexSnapshotsToDiskWithPredicate( + namespace, s, data, pred, snapshotInterval) + require.NoError(t, err) +} diff --git a/src/dbnode/integration/commitlog_bootstrap_index_with_snapshots_test.go b/src/dbnode/integration/commitlog_bootstrap_index_with_snapshots_test.go new file mode 100644 index 0000000000..cad05d58b3 --- /dev/null +++ b/src/dbnode/integration/commitlog_bootstrap_index_with_snapshots_test.go @@ -0,0 +1,227 @@ +// +build integration + +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package integration + +import ( + "testing" + "time" + + "github.com/m3db/m3/src/dbnode/integration/generate" + "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/retention" + "github.com/m3db/m3/src/dbnode/storage/index" + "github.com/m3db/m3/src/m3ninx/idx" + "github.com/m3db/m3/src/x/ident" + + "github.com/stretchr/testify/require" +) + +func TestCommitLogIndexBootstrapWithSnapshots(t *testing.T) { + if testing.Short() { + t.SkipNow() // Just skip if we're doing a short run + } + + // Test setup + var ( + rOpts = retention.NewOptions().SetRetentionPeriod(12 * time.Hour) + blockSize = rOpts.BlockSize() + ) + + nsOpts := namespace.NewOptions(). + SetRetentionOptions(rOpts). + SetIndexOptions(namespace.NewIndexOptions(). + SetEnabled(true). + SetBlockSize(blockSize), + ).SetColdWritesEnabled(true) + ns1, err := namespace.NewMetadata(testNamespaces[0], nsOpts) + require.NoError(t, err) + ns2, err := namespace.NewMetadata(testNamespaces[1], nsOpts) + require.NoError(t, err) + opts := NewTestOptions(t). + SetNamespaces([]namespace.Metadata{ns1, ns2}) + + setup, err := NewTestSetup(t, opts, nil) + require.NoError(t, err) + defer setup.Close() + + commitLogOpts := setup.StorageOpts().CommitLogOptions(). 
+ SetFlushInterval(defaultIntegrationTestFlushInterval) + setup.SetStorageOpts(setup.StorageOpts().SetCommitLogOptions(commitLogOpts)) + + log := setup.StorageOpts().InstrumentOptions().Logger() + log.Info("commit log bootstrap test") + + // Write test data + log.Info("generating data") + now := setup.NowFn()() + fooSeries := generate.Series{ + ID: ident.StringID("foo"), + Tags: ident.NewTags(ident.StringTag("city", "new_york"), ident.StringTag("foo", "foo")), + } + + barSeries := generate.Series{ + ID: ident.StringID("bar"), + Tags: ident.NewTags(ident.StringTag("city", "new_jersey")), + } + + bazSeries := generate.Series{ + ID: ident.StringID("baz"), + Tags: ident.NewTags(ident.StringTag("city", "seattle")), + } + + unindexedSeries := generate.Series{ + ID: ident.StringID("unindexed"), + } + + seriesMaps := generate.BlocksByStart([]generate.BlockConfig{ + { + IDs: []string{fooSeries.ID.String()}, + Tags: fooSeries.Tags, + NumPoints: 100, + Start: now.Add(-blockSize), + }, + { + IDs: []string{barSeries.ID.String()}, + Tags: barSeries.Tags, + NumPoints: 100, + Start: now.Add(-blockSize), + }, + { + IDs: []string{fooSeries.ID.String()}, + Tags: fooSeries.Tags, + NumPoints: 50, + Start: now, + }, + { + IDs: []string{bazSeries.ID.String()}, + // NB(bodu): Each dp adds 1 sec to the start time, therefore the baz series + // only exists in snapshots due to the snapshot interval being 1 minute. + // This tests whether or not we can properly bootstrap a series that we fully + // rely on snapshots for. 
+ Tags: bazSeries.Tags, + NumPoints: 50, + Start: now.Truncate(blockSize), + }, + { + IDs: []string{unindexedSeries.ID.String()}, + Tags: ident.Tags{}, + NumPoints: 1, + Start: now, + }, + }) + + log.Info("writing data") + var ( + snapshotInterval = time.Minute + numDatapointsNotInSnapshots = 0 + numDatapointsNotInCommitLogs = 0 + snapshotsPred = func(dp generate.TestValue) bool { + blockStart := dp.Timestamp.Truncate(blockSize) + if dp.Timestamp.Before(blockStart.Add(snapshotInterval)) { + return true + } + + numDatapointsNotInSnapshots++ + return false + } + commitLogPred = func(dp generate.TestValue) bool { + blockStart := dp.Timestamp.Truncate(blockSize) + if dp.Timestamp.Equal(blockStart.Add(snapshotInterval)) || dp.Timestamp.After(blockStart.Add(snapshotInterval)) { + return true + } + + numDatapointsNotInCommitLogs++ + return false + } + ) + for _, ns := range []namespace.Metadata{ + ns1, + ns2, + } { + writeIndexSnapshotsWithPredicate( + t, setup, seriesMaps, ns, snapshotsPred, snapshotInterval) + writeSnapshotsWithPredicate( + t, setup, seriesMaps, 0, ns, snapshotsPred, snapshotInterval) + writeCommitLogDataWithPredicate( + t, setup, commitLogOpts, seriesMaps, ns, commitLogPred) + } + // Ensure we've excluded some dps from data/index snapshot and commitlog files. + require.True(t, numDatapointsNotInSnapshots > 0) // This num is 2x'ed but its fine. + require.True(t, numDatapointsNotInCommitLogs > 0) + log.Info("finished writing data") + + // Setup bootstrapper after writing data so filesystem inspection can find it. 
+ setupCommitLogBootstrapperWithFSInspection(t, setup, commitLogOpts) + + setup.SetNowFn(now) + // Start the server with filesystem bootstrapper + require.NoError(t, setup.StartServer()) + log.Debug("server is now up") + + // Stop the server + defer func() { + require.NoError(t, setup.StopServer()) + log.Debug("server is now down") + }() + + // Verify in-memory data match what we expect - all writes from seriesMaps + // should be present + verifySeriesMaps(t, setup, testNamespaces[0], seriesMaps) + verifySeriesMaps(t, setup, testNamespaces[1], seriesMaps) + + // Issue some index queries + session, err := setup.M3DBClient().DefaultSession() + require.NoError(t, err) + + start := now.Add(-rOpts.RetentionPeriod()) + end := now.Add(blockSize) + queryOpts := index.QueryOptions{StartInclusive: start, EndExclusive: end} + + // Match all new_*r* + regexpQuery, err := idx.NewRegexpQuery([]byte("city"), []byte("new_.*r.*")) + require.NoError(t, err) + iter, fetchResponse, err := session.FetchTaggedIDs(ns1.ID(), + index.Query{Query: regexpQuery}, queryOpts) + require.NoError(t, err) + defer iter.Finalize() + + verifyQueryMetadataResults(t, iter, fetchResponse.Exhaustive, verifyQueryMetadataResultsOptions{ + namespace: ns1.ID(), + exhaustive: true, + expected: []generate.Series{fooSeries, barSeries}, + }) + + // Match all *e*e* + regexpQuery, err = idx.NewRegexpQuery([]byte("city"), []byte(".*e.*e.*")) + require.NoError(t, err) + iter, fetchResponse, err = session.FetchTaggedIDs(ns1.ID(), + index.Query{Query: regexpQuery}, queryOpts) + require.NoError(t, err) + defer iter.Finalize() + + verifyQueryMetadataResults(t, iter, fetchResponse.Exhaustive, verifyQueryMetadataResultsOptions{ + namespace: ns1.ID(), + exhaustive: true, + expected: []generate.Series{barSeries, bazSeries}, + }) +} diff --git a/src/dbnode/integration/commitlog_bootstrap_with_snapshots_test.go b/src/dbnode/integration/commitlog_bootstrap_with_snapshots_test.go index 60c50d3d10..4a62f78cbb 100644 --- 
a/src/dbnode/integration/commitlog_bootstrap_with_snapshots_test.go +++ b/src/dbnode/integration/commitlog_bootstrap_with_snapshots_test.go @@ -109,7 +109,7 @@ func testCommitLogBootstrapWithSnapshots(t *testing.T, setTestOpts setTestOption ) writeSnapshotsWithPredicate( - t, setup, commitLogOpts, seriesMaps, 0, ns1, nil, pred, snapshotInterval) + t, setup, seriesMaps, 0, ns1, pred, snapshotInterval) numDatapointsNotInCommitLogs := 0 writeCommitLogDataWithPredicate(t, setup, commitLogOpts, seriesMaps, ns1, func(dp generate.TestValue) bool { diff --git a/src/dbnode/integration/disk_snapshot_test.go b/src/dbnode/integration/disk_snapshot_test.go index 67d01437a9..bade4ab295 100644 --- a/src/dbnode/integration/disk_snapshot_test.go +++ b/src/dbnode/integration/disk_snapshot_test.go @@ -26,9 +26,13 @@ import ( "testing" "time" + "github.com/m3db/m3/src/dbnode/generated/proto/index" "github.com/m3db/m3/src/dbnode/integration/generate" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" + "github.com/m3db/m3/src/dbnode/storage" + xclock "github.com/m3db/m3/src/x/clock" xtime "github.com/m3db/m3/src/x/time" "github.com/stretchr/testify/require" @@ -69,7 +73,7 @@ func TestDiskSnapshotSimple(t *testing.T) { // Start the server log := testSetup.StorageOpts().InstrumentOptions().Logger() - log.Debug("disk flush test") + log.Debug("disk flush snapshot test") require.NoError(t, testSetup.StartServer()) log.Debug("server is now up") @@ -198,3 +202,173 @@ func TestDiskSnapshotSimple(t *testing.T) { } } } + +func TestDiskIndexSnapshotSimple(t *testing.T) { + if testing.Short() { + t.SkipNow() // Just skip if we're doing a short run + } + // Test setup + var ( + nOpts = namespace.NewOptions(). + SetSnapshotEnabled(true) + bufferPast = 10 * time.Minute + bufferFuture = 10 * time.Minute + blockSize = time.Hour + ) + + nOpts = nOpts. + SetRetentionOptions(nOpts.RetentionOptions(). 
+ SetBufferFuture(bufferFuture). + SetBufferPast(bufferPast). + SetBlockSize(blockSize)). + SetIndexOptions(namespace.NewIndexOptions(). + SetBlockSize(blockSize). + SetEnabled(true)). + SetColdWritesEnabled(true) + md1, err := namespace.NewMetadata(testNamespaces[0], nOpts) + require.NoError(t, err) + md2, err := namespace.NewMetadata(testNamespaces[1], nOpts) + require.NoError(t, err) + + testOpts := NewTestOptions(t). + SetTickMinimumInterval(time.Second). + SetNamespaces([]namespace.Metadata{md1, md2}) + testSetup, err := NewTestSetup(t, testOpts, nil) + require.NoError(t, err) + defer testSetup.Close() + + onColdFlush := &testOnColdFlush{ + // Force cold flush to be slow enough to be able + // to consistently check index snapshots before flush + // finishes. + sleepDuration: 10 * time.Second, + } + storageOpts := testSetup.StorageOpts(). + SetOnColdFlush(onColdFlush) + testSetup.SetStorageOpts(storageOpts) + + // Generate test index data + var ( + numWrites = 50 + numTags = 10 + nowFn = testSetup.NowFn() + ) + + t0 := nowFn().Truncate(blockSize).Add(-2 * blockSize) + t1 := t0.Add(blockSize) + t2 := t1.Add(blockSize) + writesPeriod0 := GenerateTestIndexWrite(0, numWrites, numTags, t0, t1) + writesPeriod1 := GenerateTestIndexWrite(1, numWrites, numTags, t1, t2) + + // Start the server + log := testSetup.StorageOpts().InstrumentOptions().Logger() + log.Debug("disk flush index snapshot test") + require.NoError(t, testSetup.StartServer()) + log.Debug("server is now up") + + // Stop the server + defer func() { + require.NoError(t, testSetup.StopServer()) + log.Debug("server is now down") + }() + + // Write index data + start := nowFn() + session, err := testSetup.M3DBClient().DefaultSession() + for _, ns := range testSetup.Namespaces() { + writesPeriod0.Write(t, ns.ID(), session) + writesPeriod1.Write(t, ns.ID(), session) + } + log.Info("test data written", zap.Duration("took", time.Since(start))) + + var ( + fsOpts = testSetup.StorageOpts(). + CommitLogOptions(). 
+ FilesystemOptions() + maxWaitTime = time.Minute + ) + for _, ns := range testSetup.Namespaces() { + start := nowFn() + log.Info("waiting for index snapshot files to flush", + zap.Any("ns", ns.ID())) + xclock.WaitUntil(func() bool { + numDocsPerBlockStart, err := getNumDocsPerBlockStart( + ns.ID(), fsOpts, persist.FileSetSnapshotType) + require.NoError(t, err) + totalNumDocs := 0 + for _, numDocs := range numDocsPerBlockStart { + totalNumDocs += numDocs + } + return totalNumDocs == len(writesPeriod0)+len(writesPeriod1) + + }, maxWaitTime) + log.Info("index snapshot files flushed", + zap.Duration("took", time.Since(start)), + zap.Any("ns", ns.ID())) + } + + var ( + newTime = testSetup.NowFn()().Add(blockSize * 2) + ) + log.Info("restarting server with new options") + testSetup.SetNowFn(newTime) + onColdFlush = &testOnColdFlush{ + // Speed cold flushes back up to normal speed. + sleepDuration: 0, + } + storageOpts = testSetup.StorageOpts(). + SetOnColdFlush(onColdFlush) + testSetup.SetStorageOpts(storageOpts) + require.NoError(t, testSetup.StopServer()) + require.NoError(t, testSetup.StartServer()) + log.Info("server is now up") + + for _, ns := range testSetup.Namespaces() { + start := nowFn() + log.Info("waiting for old index snapshot files to be cleaned up", + zap.Any("ns", ns.ID())) + xclock.WaitUntil(func() bool { + numDocsPerBlockStart, err := getNumDocsPerBlockStart( + ns.ID(), fsOpts, persist.FileSetSnapshotType) + require.NoError(t, err) + totalNumDocs := 0 + for _, numDocs := range numDocsPerBlockStart { + totalNumDocs += numDocs + } + return totalNumDocs == 0 + + }, maxWaitTime) + log.Info("index snapshot files cleaned up", + zap.Duration("took", time.Since(start)), + zap.Any("ns", ns.ID())) + } +} + +type indexSnapshotInfo struct { + Info index.IndexVolumeInfo + VolumeIndex int +} + +type testOnColdFlush struct { + sleepDuration time.Duration +} + +func (o *testOnColdFlush) ColdFlushNamespace(ns storage.Namespace) (storage.OnColdFlushNamespace, error) { 
+ return &testOnColdFlushNs{ + sleepDuration: o.sleepDuration, + }, nil +} + +type testOnColdFlushNs struct { + sleepDuration time.Duration +} + +func (o *testOnColdFlushNs) OnFlushNewSeries(event persist.OnFlushNewSeriesEvent) error { + return nil +} + +func (o *testOnColdFlushNs) Done() error { + // Allows injection of artificial lag for testing. + time.Sleep(o.sleepDuration) + return nil +} diff --git a/src/dbnode/integration/fs_bootstrap_index_test.go b/src/dbnode/integration/fs_bootstrap_index_test.go index 714b8935f6..eed5ad7f8f 100644 --- a/src/dbnode/integration/fs_bootstrap_index_test.go +++ b/src/dbnode/integration/fs_bootstrap_index_test.go @@ -157,7 +157,9 @@ func testFilesystemBootstrapIndexWithIndexingEnabled( }) require.NoError(t, writeTestDataToDisk(ns1, setup, seriesMaps, 0)) + require.NoError(t, writeTestIndexToDisk(ns1, setup, seriesMaps)) require.NoError(t, writeTestDataToDisk(ns2, setup, nil, 0)) + require.NoError(t, writeTestIndexToDisk(ns2, setup, nil)) // Start the server with filesystem bootstrapper log := setup.StorageOpts().InstrumentOptions().Logger() diff --git a/src/dbnode/integration/generate/options.go b/src/dbnode/integration/generate/options.go index 186f5dc7f6..90d08acc38 100644 --- a/src/dbnode/integration/generate/options.go +++ b/src/dbnode/integration/generate/options.go @@ -36,6 +36,9 @@ const ( // defaultBlockSize is the default block size defaultBlockSize = 2 * time.Hour + // defaultIndexBlockSize is the default index block size + defaultIndexBlockSize = 2 * time.Hour + // defaultWriterBufferSize is the default buffer size for writing TSDB files defaultWriterBufferSize = 65536 @@ -61,6 +64,7 @@ type options struct { clockOpts clock.Options retentionPeriod time.Duration blockSize time.Duration + indexBlockSize time.Duration filePathPrefix string newFileMode os.FileMode newDirectoryMode os.FileMode @@ -82,6 +86,7 @@ func NewOptions() Options { clockOpts: clock.NewOptions(), retentionPeriod: defaultRetentionPeriod, blockSize: 
defaultBlockSize, + indexBlockSize: defaultIndexBlockSize, filePathPrefix: defaultFilePathPrefix, newFileMode: defaultNewFileMode, newDirectoryMode: defaultNewDirectoryMode, @@ -122,6 +127,16 @@ func (o *options) BlockSize() time.Duration { return o.blockSize } +func (o *options) SetIndexBlockSize(value time.Duration) Options { + opts := *o + opts.indexBlockSize = value + return &opts +} + +func (o *options) IndexBlockSize() time.Duration { + return o.indexBlockSize +} + func (o *options) SetFilePathPrefix(value string) Options { opts := *o opts.filePathPrefix = value diff --git a/src/dbnode/integration/generate/types.go b/src/dbnode/integration/generate/types.go index 118dd867f7..6ec7dc0d8a 100644 --- a/src/dbnode/integration/generate/types.go +++ b/src/dbnode/integration/generate/types.go @@ -79,6 +79,8 @@ type SeriesBlock []Series type SeriesBlocksByStart map[xtime.UnixNano]SeriesBlock // Writer writes generated data to disk. +// NB(bodu): When writing index data/snapshots to disk, the writer uses the +// block starts of the supplied seriesMaps arg. type Writer interface { // WriteData writes the data as data files. WriteData( @@ -97,6 +99,32 @@ type Writer interface { snapshotInterval time.Duration, ) error + // WriteIndex writes index files for data in series maps. + WriteIndex( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + ) error + + // WriteIndexWithPredicate writes index files for all data in series maps that + // passes the predicate test. + WriteIndexWithPredicate( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + pred WriteDatapointPredicate, + ) error + + // WriteIndexSnapshotWithPredicate writes index snapshot files for all data in series maps that + // passes the predicate test. 
+ WriteIndexSnapshotWithPredicate( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + pred WriteDatapointPredicate, + snapshotInterval time.Duration, + ) error + // WriteDataWithPredicate writes all data that passes the predicate test as data files. WriteDataWithPredicate( nsCtx ns.Context, @@ -137,6 +165,12 @@ type Options interface { // BlockSize returns the blockSize. BlockSize() time.Duration + // SetIndexBlockSize sets the index blockSize. + SetIndexBlockSize(value time.Duration) Options + + // IndexBlockSize returns the index blockSize. + IndexBlockSize() time.Duration + // SetFilePathPrefix sets the file path prefix for sharded TSDB files. SetFilePathPrefix(value string) Options diff --git a/src/dbnode/integration/generate/writer.go b/src/dbnode/integration/generate/writer.go index 419f99e2f1..b475950b69 100644 --- a/src/dbnode/integration/generate/writer.go +++ b/src/dbnode/integration/generate/writer.go @@ -21,6 +21,7 @@ package generate import ( + "errors" "time" "github.com/m3db/m3/src/dbnode/encoding" @@ -28,11 +29,20 @@ import ( "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/sharding" + "github.com/m3db/m3/src/dbnode/storage/index/convert" + "github.com/m3db/m3/src/m3ninx/doc" + "github.com/m3db/m3/src/m3ninx/index/segment/builder" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/context" xtime "github.com/m3db/m3/src/x/time" ) +var ( + errInvalidFileSetType = errors.New("invalid file set type") +) + type writer struct { opts Options } @@ -69,6 +79,36 @@ func (w *writer) WriteSnapshot( nsCtx, shardSet, seriesMaps, volume, WriteAllPredicate, snapshotInterval) } +func (w *writer) WriteIndex( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, +) error { + return w.WriteIndexWithPredicate( + nsCtx, shardSet, 
seriesMaps, WriteAllPredicate) +} + +func (w *writer) WriteIndexWithPredicate( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + pred WriteDatapointPredicate, +) error { + return w.writeIndexWithPredicate( + nsCtx, shardSet, seriesMaps, pred, persist.FileSetFlushType, 0) +} + +func (w *writer) WriteIndexSnapshotWithPredicate( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + pred WriteDatapointPredicate, + snapshotInterval time.Duration, +) error { + return w.writeIndexWithPredicate( + nsCtx, shardSet, seriesMaps, pred, persist.FileSetSnapshotType, snapshotInterval) +} + func (w *writer) WriteDataWithPredicate( nsCtx ns.Context, shardSet sharding.ShardSet, @@ -151,6 +191,86 @@ func (w *writer) writeWithPredicate( return nil } +func (w *writer) writeIndexWithPredicate( + nsCtx ns.Context, + shardSet sharding.ShardSet, + seriesMaps SeriesBlocksByStart, + pred WriteDatapointPredicate, + fileSetType persist.FileSetType, + snapshotInterval time.Duration, +) error { + gOpts := w.opts + writer, err := fs.NewIndexWriter(fs.NewOptions(). + SetFilePathPrefix(gOpts.FilePathPrefix()). + SetWriterBufferSize(gOpts.WriterBufferSize()). + SetNewFileMode(gOpts.NewFileMode()). + SetNewDirectoryMode(gOpts.NewDirectoryMode())) + if err != nil { + return err + } + shardsMap := make(map[uint32]struct{}) + for _, shard := range shardSet.AllIDs() { + shardsMap[shard] = struct{}{} + } + // Convert series maps to series per block after applying predicate. 
+ docsPerBlockStart := make(map[xtime.UnixNano][]doc.Document) + for start, data := range seriesMaps { + for _, series := range data { + var found bool + for _, dp := range series.Data { + if pred(dp) { + found = true + break + } + } + if found { + doc, err := convert.FromSeriesIDAndTags(series.ID, series.Tags) + if err != nil { + return err + } + docsPerBlockStart[start] = append(docsPerBlockStart[start], doc) + } + } + } + + var ( + indexBlockSize = gOpts.IndexBlockSize() + currStart = xtime.ToUnixNano(gOpts.ClockOptions().NowFn()().Truncate(indexBlockSize)) + ) + for start, docs := range docsPerBlockStart { + var indexVolumeType idxpersist.IndexVolumeType + switch fileSetType { + case persist.FileSetFlushType: + indexVolumeType = idxpersist.DefaultIndexVolumeType + case persist.FileSetSnapshotType: + indexVolumeType = idxpersist.SnapshotColdIndexVolumeType + if start.Equal(currStart) || start.After(currStart) { + indexVolumeType = idxpersist.SnapshotWarmIndexVolumeType + } + default: + return errInvalidFileSetType + } + + if err := writeIndexToDisk( + nsCtx, + writer, + shardsMap, + indexBlockSize, + start.ToTime(), + snapshotInterval, + gOpts.FilePathPrefix(), + indexVolumeType, + fileSetType, + docs, + ); err != nil { + return err + } + } + + // No-op for empty start periods since commit log bootstrap fulfills all requested ranges if successful. 
+ return nil +} + func writeToDiskWithPredicate( writer fs.DataFileSetWriter, shardSet sharding.ShardSet, @@ -235,3 +355,69 @@ func writeToDiskWithPredicate( return nil } + +// writeIndexToDisk builds a segment from docs and writes it to disk as the +// next index fileset volume for the block beginning at start. +func writeIndexToDisk( + nsCtx ns.Context, + writer fs.IndexFileSetWriter, + shardsMap map[uint32]struct{}, + blockSize time.Duration, + start time.Time, + snapshotInterval time.Duration, + filePathPrefix string, + indexVolumeType idxpersist.IndexVolumeType, + fileSetType persist.FileSetType, + docs []doc.Document, +) error { + volumeIndex, err := fs.NextIndexFileSetVolumeIndex( + filePathPrefix, + nsCtx.ID, + start, + ) + if err != nil { + return err + } + writerOpts := fs.IndexWriterOpenOptions{ + Identifier: fs.FileSetFileIdentifier{ + Namespace: nsCtx.ID, + BlockStart: start, + VolumeIndex: volumeIndex, + FileSetContentType: persist.FileSetIndexContentType, + }, + FileSetType: fileSetType, + BlockSize: blockSize, + Shards: shardsMap, + IndexVolumeType: indexVolumeType, + Snapshot: fs.IndexWriterSnapshotOptions{ + SnapshotTime: start.Add(snapshotInterval), + }, + } + if err := writer.Open(writerOpts); err != nil { + return err + } + segmentWriter, err := idxpersist.NewMutableSegmentFileSetWriter(fst.WriterOptions{}) + if err != nil { + return err + } + builder, err := builder.NewBuilderFromDocuments(builder.NewOptions()) + if err != nil { + return err + } + for _, doc := range docs { + _, err = builder.Insert(doc) + if err != nil { + return err + } + } + if err := segmentWriter.Reset(builder); err != nil { + return err + } + if err := writer.WriteSegmentFileSet(segmentWriter); err != nil { + return err + } + if err := builder.Close(); err != nil { + return err + } + return writer.Close() +} diff --git a/src/dbnode/integration/integration.go b/src/dbnode/integration/integration.go index 31883a58fb..d04cccf80a 100644 --- a/src/dbnode/integration/integration.go +++ b/src/dbnode/integration/integration.go @@ -339,7 +339,7 @@ func writeTestDataToDisk( volume int, ) error { ropts := metadata.Options().RetentionOptions() - writer := 
generate.NewWriter(setup.GeneratorOptions(ropts)) + writer := generate.NewWriter(setup.GeneratorOptions(ropts, metadata.Options().IndexOptions())) return writer.WriteData(namespace.NewContextFrom(metadata), setup.ShardSet(), seriesMaps, volume) } @@ -352,11 +352,34 @@ func writeTestSnapshotsToDiskWithPredicate( snapshotInterval time.Duration, ) error { ropts := metadata.Options().RetentionOptions() - writer := generate.NewWriter(setup.GeneratorOptions(ropts)) + writer := generate.NewWriter(setup.GeneratorOptions(ropts, metadata.Options().IndexOptions())) return writer.WriteSnapshotWithPredicate( namespace.NewContextFrom(metadata), setup.ShardSet(), seriesMaps, volume, pred, snapshotInterval) } +func writeTestIndexToDisk( + metadata namespace.Metadata, + setup TestSetup, + seriesMaps generate.SeriesBlocksByStart, +) error { + ropts := metadata.Options().RetentionOptions() + writer := generate.NewWriter(setup.GeneratorOptions(ropts, metadata.Options().IndexOptions())) + return writer.WriteIndex(namespace.NewContextFrom(metadata), setup.ShardSet(), seriesMaps) +} + +func writeTestIndexSnapshotsToDiskWithPredicate( + metadata namespace.Metadata, + setup TestSetup, + seriesMaps generate.SeriesBlocksByStart, + pred generate.WriteDatapointPredicate, + snapshotInterval time.Duration, +) error { + ropts := metadata.Options().RetentionOptions() + writer := generate.NewWriter(setup.GeneratorOptions(ropts, metadata.Options().IndexOptions())) + return writer.WriteIndexSnapshotWithPredicate( + namespace.NewContextFrom(metadata), setup.ShardSet(), seriesMaps, pred, snapshotInterval) +} + func concatShards(a, b shard.Shards) shard.Shards { all := append(a.All(), b.All()...) 
return shard.NewShards(all) diff --git a/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go b/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go index 10a2d2890b..7cae162dcb 100644 --- a/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go +++ b/src/dbnode/integration/peers_bootstrap_index_aggregate_test.go @@ -55,7 +55,8 @@ func TestPeersBootstrapIndexAggregateQuery(t *testing.T) { SetBlockSize(2 * blockSize) nOpts := namespace.NewOptions(). SetRetentionOptions(rOpts). - SetIndexOptions(idxOpts) + SetIndexOptions(idxOpts). + SetColdWritesEnabled(true) ns1, err := namespace.NewMetadata(testNamespaces[0], nOpts) require.NoError(t, err) opts := NewTestOptions(t). diff --git a/src/dbnode/integration/peers_bootstrap_index_test.go b/src/dbnode/integration/peers_bootstrap_index_test.go index d5907d9dc1..5a40197042 100644 --- a/src/dbnode/integration/peers_bootstrap_index_test.go +++ b/src/dbnode/integration/peers_bootstrap_index_test.go @@ -29,11 +29,13 @@ import ( indexpb "github.com/m3db/m3/src/dbnode/generated/proto/index" "github.com/m3db/m3/src/dbnode/integration/generate" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/m3ninx/generated/proto/fswriter" "github.com/m3db/m3/src/m3ninx/idx" + xclock "github.com/m3db/m3/src/x/clock" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" xtime "github.com/m3db/m3/src/x/time" @@ -60,7 +62,8 @@ func TestPeersBootstrapIndexWithIndexingEnabled(t *testing.T) { SetBlockSize(blockSize) nOpts := namespace.NewOptions(). SetRetentionOptions(rOpts). - SetIndexOptions(idxOpts) + SetIndexOptions(idxOpts). + SetColdWritesEnabled(true) ns1, err := namespace.NewMetadata(testNamespaces[0], nOpts) require.NoError(t, err) opts := NewTestOptions(t). 
@@ -155,6 +158,24 @@ func TestPeersBootstrapIndexWithIndexingEnabled(t *testing.T) { verifySeriesMaps(t, setup, ns1.ID(), seriesMaps) } + // Ensure that the index data for qux has been written to disk for node-0 by the warm flush lifecycle. + // This means this data is not initially present but will ultimately end up on disk after the + // warm flush lifecycle completes. We encounter this case only when the node crashes between a warm data flush + // and a warm index flush. + require.True(t, xclock.WaitUntil(func() bool { + numDocsPerBlockStart, err := getNumDocsPerBlockStart( + ns1.ID(), + setups[0].FilesystemOpts(), + persist.FileSetFlushType, + ) + require.NoError(t, err) + numDocs, ok := numDocsPerBlockStart[xtime.ToUnixNano(now.Add(-2*blockSize).Truncate(blockSize))] + if !ok { + return false + } + return numDocs == 1 + }, time.Minute)) + // Issue some index queries to the second node which bootstrapped the metadata session, err := setups[1].M3DBClient().DefaultSession() require.NoError(t, err) @@ -191,10 +212,12 @@ func TestPeersBootstrapIndexWithIndexingEnabled(t *testing.T) { expected: []generate.Series{barSeries, bazSeries, quxSeries}, }) - // Ensure that the index data for qux has been written to disk. + // Ensure that the index data for qux has been written to disk for node-1 by the peers bootstrapper. + // This means this data should show up once node-1 has been successfully bootstrapped. 
numDocsPerBlockStart, err := getNumDocsPerBlockStart( ns1.ID(), setups[1].FilesystemOpts(), + persist.FileSetFlushType, ) require.NoError(t, err) numDocs, ok := numDocsPerBlockStart[xtime.ToUnixNano(now.Add(-2*blockSize).Truncate(blockSize))] @@ -210,12 +233,14 @@ type indexInfo struct { func getNumDocsPerBlockStart( nsID ident.ID, fsOpts fs.Options, + fileType persist.FileSetType, ) (map[xtime.UnixNano]int, error) { numDocsPerBlockStart := make(map[xtime.UnixNano]int) infoFiles := fs.ReadIndexInfoFiles( fsOpts.FilePathPrefix(), nsID, fsOpts.InfoReaderBufferSize(), + fileType, ) // Grab the latest index info file for each blockstart. latestIndexInfoPerBlockStart := make(map[xtime.UnixNano]indexInfo) diff --git a/src/dbnode/integration/setup.go b/src/dbnode/integration/setup.go index cf02d7c939..e4eea09ef4 100644 --- a/src/dbnode/integration/setup.go +++ b/src/dbnode/integration/setup.go @@ -172,7 +172,7 @@ type TestSetup interface { BlockLeaseManager() block.LeaseManager ShardSet() sharding.ShardSet SetShardSet(sharding.ShardSet) - GeneratorOptions(retention.Options) generate.Options + GeneratorOptions(retention.Options, namespace.IndexOptions) generate.Options MaybeResetClients() error SchemaRegistry() namespace.SchemaRegistry NamespaceMetadataOrFail(ident.ID) namespace.Metadata @@ -625,7 +625,10 @@ func (ts *testSetup) NamespaceMetadataOrFail(id ident.ID) namespace.Metadata { return nil } -func (ts *testSetup) GeneratorOptions(ropts retention.Options) generate.Options { +func (ts *testSetup) GeneratorOptions( + ropts retention.Options, + indexOpts namespace.IndexOptions, +) generate.Options { var ( storageOpts = ts.storageOpts fsOpts = storageOpts.CommitLogOptions().FilesystemOptions() @@ -637,6 +640,7 @@ func (ts *testSetup) GeneratorOptions(ropts retention.Options) generate.Options SetClockOptions(co). SetRetentionPeriod(ropts.RetentionPeriod()). SetBlockSize(ropts.BlockSize()). + SetIndexBlockSize(indexOpts.BlockSize()). 
SetFilePathPrefix(fsOpts.FilePathPrefix()). SetNewFileMode(fsOpts.NewFileMode()). SetNewDirectoryMode(fsOpts.NewDirectoryMode()). diff --git a/src/dbnode/persist/fs/files.go b/src/dbnode/persist/fs/files.go index 1de9a425d6..02fa2008d5 100644 --- a/src/dbnode/persist/fs/files.go +++ b/src/dbnode/persist/fs/files.go @@ -51,13 +51,15 @@ var ( errSnapshotTimeAndIDZero = errors.New("tried to read snapshot time and ID of zero value") errNonSnapshotFileset = errors.New("tried to determine snapshot time and id of non-snapshot") + errInvalidContentType = errors.New("invalid content type") ) const ( - dataDirName = "data" - indexDirName = "index" - snapshotDirName = "snapshots" - commitLogsDirName = "commitlogs" + dataDirName = "data" + indexDirName = "index" + snapshotDirName = "snapshots" + snapshotIndexDirName = "snapshots_index" + commitLogsDirName = "commitlogs" // The maximum number of delimeters ('-' or '.') that is expected in a // (base) filename. @@ -595,30 +597,49 @@ func timeAndIndexFromFileName(fname string, componentPosition int) (time.Time, i // SnapshotTimeAndID returns the metadata for the snapshot. 
func SnapshotTimeAndID( filePathPrefix string, id FileSetFileIdentifier) (time.Time, uuid.UUID, error) { - decoder := msgpack.NewDecoder(nil) - return snapshotTimeAndID(filePathPrefix, id, decoder) -} - -func snapshotTimeAndID( - filePathPrefix string, - id FileSetFileIdentifier, - decoder *msgpack.Decoder, -) (time.Time, uuid.UUID, error) { infoBytes, err := readSnapshotInfoFile(filePathPrefix, id, defaultBufioReaderSize) if err != nil { return time.Time{}, nil, fmt.Errorf("error reading snapshot info file: %v", err) } + switch id.FileSetContentType { + case persist.FileSetDataContentType: + return dataSnapshotTimeAndID(infoBytes) + case persist.FileSetIndexContentType: + return indexSnapshotTimeAndID(infoBytes) + } + return time.Time{}, nil, errInvalidContentType +} +func dataSnapshotTimeAndID( + infoBytes []byte, +) (time.Time, uuid.UUID, error) { + decoder := msgpack.NewDecoder(nil) decoder.Reset(msgpack.NewByteDecoderStream(infoBytes)) info, err := decoder.DecodeIndexInfo() if err != nil { - return time.Time{}, nil, fmt.Errorf("error decoding snapshot info file: %v", err) + return time.Time{}, nil, fmt.Errorf("error decoding data snapshot info file: %v", err) } var parsedSnapshotID uuid.UUID err = parsedSnapshotID.UnmarshalBinary(info.SnapshotID) if err != nil { - return time.Time{}, nil, fmt.Errorf("error parsing snapshot ID from snapshot info file: %v", err) + return time.Time{}, nil, fmt.Errorf("error parsing snapshot ID from data snapshot info file: %v", err) + } + + return time.Unix(0, info.SnapshotTime), parsedSnapshotID, nil +} + +func indexSnapshotTimeAndID( + infoBytes []byte, +) (time.Time, uuid.UUID, error) { + var info index.IndexVolumeInfo + if err := info.Unmarshal(infoBytes); err != nil { + return time.Time{}, nil, fmt.Errorf("error decoding index snapshot info file: %v", err) + } + var parsedSnapshotID uuid.UUID + err := parsedSnapshotID.UnmarshalBinary(info.SnapshotID) + if err != nil { + return time.Time{}, nil, fmt.Errorf("error parsing snapshot ID from index snapshot info file: %v", err) } return 
time.Unix(0, info.SnapshotTime), parsedSnapshotID, nil @@ -626,11 +647,36 @@ func snapshotTimeAndID( func readSnapshotInfoFile(filePathPrefix string, id FileSetFileIdentifier, readerBufferSize int) ([]byte, error) { var ( - shardDir = ShardSnapshotsDirPath(filePathPrefix, id.Namespace, id.Shard) - checkpointFilePath = filesetPathFromTimeAndIndex(shardDir, id.BlockStart, id.VolumeIndex, checkpointFileSuffix) + dir string + infoDigestFromDataFn func(data []byte) (uint32, error) + ) + switch id.FileSetContentType { + case persist.FileSetDataContentType: + dir = ShardSnapshotsDirPath(filePathPrefix, id.Namespace, id.Shard) + infoDigestFromDataFn = func(data []byte) (uint32, error) { + buf, err := digest.ToBuffer(data) + if err != nil { + return 0, err + } + return buf.ReadDigest(), nil + } + case persist.FileSetIndexContentType: + dir = NamespaceIndexSnapshotDirPath(filePathPrefix, id.Namespace) + infoDigestFromDataFn = func(data []byte) (uint32, error) { + var indexDigest index.IndexDigests + if err := indexDigest.Unmarshal(data); err != nil { + return 0, err + } + return indexDigest.InfoDigest, nil + } - digestFilePath = filesetPathFromTimeAndIndex(shardDir, id.BlockStart, id.VolumeIndex, digestFileSuffix) - infoFilePath = filesetPathFromTimeAndIndex(shardDir, id.BlockStart, id.VolumeIndex, infoFileSuffix) + default: + return nil, errInvalidContentType + } + var ( + checkpointFilePath = filesetPathFromTimeAndIndex(dir, id.BlockStart, id.VolumeIndex, checkpointFileSuffix) + digestFilePath = filesetPathFromTimeAndIndex(dir, id.BlockStart, id.VolumeIndex, digestFileSuffix) + infoFilePath = filesetPathFromTimeAndIndex(dir, id.BlockStart, id.VolumeIndex, infoFileSuffix) ) checkpointFd, err := os.Open(checkpointFilePath) @@ -657,7 +703,10 @@ func readSnapshotInfoFile(filePathPrefix string, id FileSetFileIdentifier, reade } // Read and validate the info file - expectedInfoDigest := digest.ToBuffer(digestData).ReadDigest() + expectedInfoDigest, err := 
infoDigestFromDataFn(digestData) + if err != nil { + return nil, err + } return readAndValidate( infoFilePath, readerBufferSize, expectedInfoDigest) } @@ -785,7 +834,11 @@ func forEachInfoFile( var expectedInfoDigest uint32 switch args.contentType { case persist.FileSetDataContentType: - expectedInfoDigest = digest.ToBuffer(digestData).ReadDigest() + buf, err := digest.ToBuffer(digestData) + if err != nil { + continue + } + expectedInfoDigest = buf.ReadDigest() case persist.FileSetIndexContentType: if err := indexDigests.Unmarshal(digestData); err != nil { continue @@ -885,11 +938,12 @@ func ReadIndexInfoFiles( filePathPrefix string, namespace ident.ID, readerBufferSize int, + fileSetType persist.FileSetType, ) []ReadIndexInfoFileResult { var infoFileResults []ReadIndexInfoFileResult forEachInfoFile( forEachInfoFileSelector{ - fileSetType: persist.FileSetFlushType, + fileSetType: fileSetType, contentType: persist.FileSetIndexContentType, filePathPrefix: filePathPrefix, namespace: namespace, @@ -922,7 +976,7 @@ func SortedSnapshotMetadataFiles(opts Options) ( []SnapshotMetadata, []SnapshotMetadataErrorWithPaths, error) { var ( prefix = opts.FilePathPrefix() - snapshotsDirPath = SnapshotDirPath(prefix) + snapshotsDirPath = SnapshotsDirPath(prefix) ) // Glob for metadata files directly instead of their checkpoint files. @@ -1312,6 +1366,9 @@ func filesetFiles(args filesetFilesSelector) (FileSetFilesSlice, error) { BlockStart: currentFileBlockStart, Shard: args.shard, VolumeIndex: volumeIndex, + // FileSetContentType is used to determine which dir to read from + // so we populate it here since it is not written to disk. 
+ FileSetContentType: args.contentType, }, args.filePathPrefix) } else if !currentFileBlockStart.Equal(latestBlockStart) || latestVolumeIndex != volumeIndex { filesetFiles = append(filesetFiles, latestFileSetFile) @@ -1320,6 +1377,9 @@ func filesetFiles(args filesetFilesSelector) (FileSetFilesSlice, error) { BlockStart: currentFileBlockStart, Shard: args.shard, VolumeIndex: volumeIndex, + // FileSetContentType is used to determine which dir to read from + // so we populate it here since it is not written to disk. + FileSetContentType: args.contentType, }, args.filePathPrefix) } latestBlockStart = currentFileBlockStart @@ -1399,11 +1459,6 @@ func IndexDataDirPath(prefix string) string { return path.Join(prefix, indexDirName, dataDirName) } -// SnapshotDirPath returns the path to the snapshot directory belong to a db -func SnapshotDirPath(prefix string) string { - return path.Join(prefix, snapshotDirName) -} - // NamespaceDataDirPath returns the path to the data directory for a given namespace. func NamespaceDataDirPath(prefix string, namespace ident.ID) string { return path.Join(prefix, dataDirName, namespace.String()) @@ -1419,9 +1474,9 @@ func NamespaceIndexDataDirPath(prefix string, namespace ident.ID) string { return path.Join(prefix, indexDirName, dataDirName, namespace.String()) } -// NamespaceIndexSnapshotDirPath returns the path to the data directory for a given namespace. +// NamespaceIndexSnapshotDirPath returns the path to the index snapshots directory for a given namespace. func NamespaceIndexSnapshotDirPath(prefix string, namespace ident.ID) string { - return path.Join(prefix, indexDirName, snapshotDirName, namespace.String()) + return path.Join(IndexSnapshotsDirPath(prefix), namespace.String()) } // SnapshotsDirPath returns the path to the snapshots directory. @@ -1429,6 +1484,11 @@ func SnapshotsDirPath(prefix string) string { return path.Join(prefix, snapshotDirName) } +// IndexSnapshotsDirPath returns the path to the index snapshots directory. 
+func IndexSnapshotsDirPath(prefix string) string { + return path.Join(prefix, snapshotIndexDirName) +} + // ShardDataDirPath returns the path to the data directory for a given shard. func ShardDataDirPath(prefix string, namespace ident.ID, shard uint32) string { namespacePath := NamespaceDataDirPath(prefix, namespace) @@ -1578,9 +1638,8 @@ func NextIndexSnapshotFileIndex(filePathPrefix string, namespace ident.ID, block var currentSnapshotIndex = -1 for _, snapshot := range snapshotFiles { - if snapshot.ID.BlockStart.Equal(blockStart) { + if snapshot.ID.BlockStart.Equal(blockStart) && snapshot.ID.VolumeIndex > currentSnapshotIndex { currentSnapshotIndex = snapshot.ID.VolumeIndex - break } } diff --git a/src/dbnode/persist/fs/files_test.go b/src/dbnode/persist/fs/files_test.go index 60a4b68dd0..754408fcae 100644 --- a/src/dbnode/persist/fs/files_test.go +++ b/src/dbnode/persist/fs/files_test.go @@ -896,8 +896,12 @@ func TestSnapshotFileHasCompleteCheckpointFile(t *testing.T) { require.Equal(t, false, f.HasCompleteCheckpointFile()) } -func TestSnapshotDirPath(t *testing.T) { - require.Equal(t, "prefix/snapshots", SnapshotDirPath("prefix")) +func TestSnapshotsDirPath(t *testing.T) { + require.Equal(t, "prefix/snapshots", SnapshotsDirPath("prefix")) +} + +func TestIndexSnapshotsDirPath(t *testing.T) { + require.Equal(t, "prefix/snapshots_index", IndexSnapshotsDirPath("prefix")) } func TestNamespaceSnapshotsDirPath(t *testing.T) { diff --git a/src/dbnode/persist/fs/fs_mock.go b/src/dbnode/persist/fs/fs_mock.go index 11da9e4812..53a42f31df 100644 --- a/src/dbnode/persist/fs/fs_mock.go +++ b/src/dbnode/persist/fs/fs_mock.go @@ -35,6 +35,7 @@ import ( "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/x/xio" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" persist0 "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/context" @@ -806,6 +807,20 @@ func (mr 
*MockIndexSegmentFileSetWriterMockRecorder) SegmentMetadata() *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadata", reflect.TypeOf((*MockIndexSegmentFileSetWriter)(nil).SegmentMetadata)) } +// SegmentState mocks base method +func (m *MockIndexSegmentFileSetWriter) SegmentState() fst.IndexSegmentState { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentState") + ret0, _ := ret[0].(fst.IndexSegmentState) + return ret0 +} + +// SegmentState indicates an expected call of SegmentState +func (mr *MockIndexSegmentFileSetWriterMockRecorder) SegmentState() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentState", reflect.TypeOf((*MockIndexSegmentFileSetWriter)(nil).SegmentState)) +} + // SegmentType mocks base method func (m *MockIndexSegmentFileSetWriter) SegmentType() persist0.IndexSegmentType { m.ctrl.T.Helper() diff --git a/src/dbnode/persist/fs/index_read_write_test.go b/src/dbnode/persist/fs/index_read_write_test.go index f2049a1260..d7a1634404 100644 --- a/src/dbnode/persist/fs/index_read_write_test.go +++ b/src/dbnode/persist/fs/index_read_write_test.go @@ -34,6 +34,7 @@ import ( "time" "github.com/m3db/m3/src/dbnode/persist" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" @@ -217,6 +218,7 @@ type testIndexSegment struct { minorVersion int metadata []byte files []testIndexSegmentFile + state fst.IndexSegmentState } type testIndexSegmentFile struct { @@ -236,6 +238,7 @@ func writeTestIndexSegments( fileSet.EXPECT().MajorVersion().Return(s.majorVersion) fileSet.EXPECT().MinorVersion().Return(s.minorVersion) fileSet.EXPECT().SegmentMetadata().Return(s.metadata) + fileSet.EXPECT().SegmentState().Return(s.state) var files []idxpersist.IndexSegmentFileType for _, f := range s.files { diff --git a/src/dbnode/persist/fs/index_write.go b/src/dbnode/persist/fs/index_write.go index b042d5dd39..9634a99b65 
100644 --- a/src/dbnode/persist/fs/index_write.go +++ b/src/dbnode/persist/fs/index_write.go @@ -31,10 +31,12 @@ import ( "github.com/m3db/m3/src/dbnode/digest" "github.com/m3db/m3/src/dbnode/generated/proto/index" "github.com/m3db/m3/src/dbnode/persist" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist "github.com/m3db/m3/src/m3ninx/persist" xerrors "github.com/m3db/m3/src/x/errors" protobuftypes "github.com/gogo/protobuf/types" + "github.com/pborman/uuid" ) const ( @@ -65,6 +67,7 @@ type indexWriter struct { start time.Time fileSetType persist.FileSetType snapshotTime time.Time + snapshotID uuid.UUID volumeIndex int indexVolumeType idxpersist.IndexVolumeType shards map[uint32]struct{} @@ -78,6 +81,7 @@ type indexWriter struct { type writtenIndexSegment struct { segmentType idxpersist.IndexSegmentType + segmentState fst.IndexSegmentState majorVersion int minorVersion int metadata []byte @@ -119,6 +123,7 @@ func (w *indexWriter) Open(opts IndexWriterOpenOptions) error { w.volumeIndex = opts.Identifier.VolumeIndex w.shards = opts.Shards w.snapshotTime = opts.Snapshot.SnapshotTime + w.snapshotID = opts.Snapshot.SnapshotID w.indexVolumeType = opts.IndexVolumeType if w.indexVolumeType == "" { w.indexVolumeType = idxpersist.DefaultIndexVolumeType @@ -169,6 +174,7 @@ func (w *indexWriter) WriteSegmentFileSet( majorVersion: segmentFileSet.MajorVersion(), minorVersion: segmentFileSet.MinorVersion(), metadata: segmentFileSet.SegmentMetadata(), + segmentState: segmentFileSet.SegmentState(), } files := segmentFileSet.Files() @@ -237,6 +243,10 @@ func (w *indexWriter) infoFileData() ([]byte, error) { for shard := range w.shards { shards = append(shards, shard) } + snapshotIDBytes, err := w.snapshotID.MarshalBinary() + if err != nil { + return nil, err + } info := &index.IndexVolumeInfo{ MajorVersion: indexFileSetMajorVersion, BlockStart: w.start.UnixNano(), @@ -244,6 +254,7 @@ func (w *indexWriter) infoFileData() ([]byte, error) { FileType: int64(w.fileSetType), 
Shards: shards, SnapshotTime: w.snapshotTime.UnixNano(), + SnapshotID: snapshotIDBytes, IndexVolumeType: &protobuftypes.StringValue{ Value: string(w.indexVolumeType), }, @@ -251,6 +262,7 @@ func (w *indexWriter) infoFileData() ([]byte, error) { for _, segment := range w.segments { segmentInfo := &index.SegmentInfo{ SegmentType: string(segment.segmentType), + SegmentState: index.SegmentState(segment.segmentState), MajorVersion: int64(segment.majorVersion), MinorVersion: int64(segment.minorVersion), Metadata: segment.metadata, diff --git a/src/dbnode/persist/fs/persist_manager.go b/src/dbnode/persist/fs/persist_manager.go index 752e48f656..e703e22a72 100644 --- a/src/dbnode/persist/fs/persist_manager.go +++ b/src/dbnode/persist/fs/persist_manager.go @@ -32,6 +32,7 @@ import ( "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" m3ninxfs "github.com/m3db/m3/src/m3ninx/index/segment/fst" m3ninxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" @@ -53,16 +54,19 @@ const ( persistManagerIdle persistManagerStatus = iota persistManagerPersistingData persistManagerPersistingIndex + persistManagerPersistingSnapshot ) var ( - errPersistManagerNotIdle = errors.New("persist manager cannot start persist, not idle") - errPersistManagerNotPersisting = errors.New("persist manager cannot finish persisting, not persisting") - errPersistManagerCannotPrepareDataNotPersisting = errors.New("persist manager cannot prepare data, not persisting") - errPersistManagerCannotPrepareIndexNotPersisting = errors.New("persist manager cannot prepare index, not persisting") - errPersistManagerFileSetAlreadyExists = errors.New("persist manager cannot prepare, fileset already exists") - errPersistManagerCannotDoneSnapshotNotSnapshot = errors.New("persist manager cannot done snapshot, file set type is not snapshot") - 
errPersistManagerCannotDoneFlushNotFlush = errors.New("persist manager cannot done flush, file set type is not flush") + errPersistManagerNotIdle = errors.New("persist manager cannot start persist, not idle") + errPersistManagerNotPersisting = errors.New("persist manager cannot finish persisting, not persisting") + errPersistManagerCannotPrepareDataNotPersisting = errors.New("persist manager cannot prepare data, not persisting") + errPersistManagerCannotPrepareDataNotPersistingSnapshot = errors.New("persist manager cannot prepare data snapshot, not persisting snapshot") + errPersistManagerCannotPrepareIndexNotPersisting = errors.New("persist manager cannot prepare index, not persisting") + errPersistManagerCannotPrepareIndexNotPersistingSnapshot = errors.New("persist manager cannot prepare index, not persisting snapshot") + errPersistManagerFileSetAlreadyExists = errors.New("persist manager cannot prepare, fileset already exists") + errPersistManagerCannotDoneSnapshotNotSnapshot = errors.New("persist manager cannot done snapshot, file set type is not snapshot") + errPersistManagerCannotDoneFlushNotFlush = errors.New("persist manager cannot done flush, file set type is not flush") ) type sleepFn func(time.Duration) @@ -116,8 +120,9 @@ type dataPersistManager struct { } type indexPersistManager struct { - writer IndexFileSetWriter - segmentWriter m3ninxpersist.MutableSegmentFileSetWriter + writer IndexFileSetWriter + segmentWriter m3ninxpersist.MutableSegmentFileSetWriter + segmentDataWriter m3ninxpersist.FSTSegmentDataFileSetWriter // identifiers required to know which file to open // after persistence is over @@ -131,6 +136,9 @@ type indexPersistManager struct { // hooks used for testing newReaderFn newIndexReaderFn newPersistentSegmentFn newPersistentSegmentFn + + // The ID of the snapshot being prepared. Only used when writing out snapshots. 
+ snapshotID uuid.UUID } type newIndexReaderFn func(Options) (IndexFileSetReader, error) @@ -174,6 +182,11 @@ func NewPersistManager(opts Options) (persist.Manager, error) { return nil, err } + segmentDataWriter, err := m3ninxpersist.NewFSTSegmentDataFileSetWriter() + if err != nil { + return nil, err + } + pm := &persistManager{ opts: opts, filePathPrefix: filePathPrefix, @@ -186,8 +199,9 @@ func NewPersistManager(opts Options) (persist.Manager, error) { snapshotMetadataWriter: NewSnapshotMetadataWriter(opts), }, indexPM: indexPersistManager{ - writer: idxWriter, - segmentWriter: segmentWriter, + writer: idxWriter, + segmentWriter: segmentWriter, + segmentDataWriter: segmentDataWriter, }, status: persistManagerIdle, metrics: newPersistManagerMetrics(scope), @@ -207,9 +221,11 @@ func (pm *persistManager) reset() { pm.worked = 0 pm.slept = 0 pm.indexPM.segmentWriter.Reset(nil) + pm.indexPM.segmentDataWriter.Reset(fst.SegmentData{}) pm.indexPM.writeErr = nil pm.indexPM.initialized = false pm.dataPM.snapshotID = nil + pm.indexPM.snapshotID = nil } // StartIndexPersist is called by the databaseFlushManager to begin the persist process for @@ -226,23 +242,22 @@ func (pm *persistManager) StartIndexPersist() (persist.IndexFlush, error) { return pm, nil } -// PrepareIndex returns a prepared persist object which can be used to persist index data. -func (pm *persistManager) PrepareIndex(opts persist.IndexPrepareOptions) (persist.PreparedIndexPersist, error) { +// PrepareIndexFlush returns a prepared persist object which can be used to persist index flush data. 
+func (pm *persistManager) PrepareIndexFlush(opts persist.IndexPrepareOptions) (persist.PreparedIndexFlushPersist, error) { var ( nsMetadata = opts.NamespaceMetadata blockStart = opts.BlockStart nsID = opts.NamespaceMetadata.ID() - prepared persist.PreparedIndexPersist + prepared persist.PreparedIndexFlushPersist ) - // only support persistence of index flush files for now if opts.FileSetType != persist.FileSetFlushType { - return prepared, fmt.Errorf("unable to PrepareIndex, unsupported file set type: %v", opts.FileSetType) + return prepared, fmt.Errorf("unable to PrepareIndexFlush, unsupported file set type: %v", opts.FileSetType) } // ensure namespace has indexing enabled if !nsMetadata.Options().IndexOptions().Enabled() { - return prepared, fmt.Errorf("unable to PrepareIndex, namespace %s does not have indexing enabled", nsID.String()) + return prepared, fmt.Errorf("unable to PrepareIndexFlush, namespace %s does not have indexing enabled", nsID.String()) } // ensure StartIndexPersist has been called @@ -292,9 +307,9 @@ func (pm *persistManager) PrepareIndex(opts persist.IndexPrepareOptions) (persis pm.indexPM.fileSetType = opts.FileSetType pm.indexPM.initialized = true - // provide persistManager hooks into PreparedIndexPersist object + // provide persistManager hooks into PreparedIndexFlushPersist object prepared.Persist = pm.persistIndex - prepared.Close = pm.closeIndex + prepared.Close = pm.closeIndexAndReadIndexSegments return prepared, nil } @@ -321,26 +336,126 @@ func (pm *persistManager) persistIndex(builder segment.Builder) error { return nil } -func (pm *persistManager) closeIndex() ([]segment.Segment, error) { +// PrepareIndexSnapshot returns a prepared persist object which can be used to persist index snapshot data. 
+func (pm *persistManager) PrepareIndexSnapshot(opts persist.IndexPrepareSnapshotOptions) (persist.PreparedIndexSnapshotPersist, error) { + var ( + nsMetadata = opts.NamespaceMetadata + blockStart = opts.BlockStart + snapshotTime = opts.SnapshotTime + snapshotID = pm.indexPM.snapshotID + nsID = opts.NamespaceMetadata.ID() + prepared persist.PreparedIndexSnapshotPersist + ) + + if opts.FileSetType != persist.FileSetSnapshotType { + return prepared, fmt.Errorf("unable to PrepareIndexSnapshot, unsupported file set type: %v", opts.FileSetType) + } + + // ensure namespace has indexing enabled + if !nsMetadata.Options().IndexOptions().Enabled() { + return prepared, fmt.Errorf("unable to PrepareIndexSnapshot, namespace %s does not have indexing enabled", nsID.String()) + } + + // ensure StartSnapshotPersist has been called + pm.RLock() + status := pm.status + pm.RUnlock() + + // only a manager in the persisting-snapshot state may prepare an index snapshot + if status != persistManagerPersistingSnapshot { + return prepared, errPersistManagerCannotPrepareIndexNotPersistingSnapshot + } + + // work out the volume index for the next Index Snapshot FileSetFile for the given namespace/blockstart + volumeIndex, err := NextIndexSnapshotFileIndex(pm.opts.FilePathPrefix(), nsMetadata.ID(), blockStart) + if err != nil { + return prepared, err + } + + // we now have all the identifiers needed to uniquely specify a single Index FileSetFile on disk. + fileSetID := FileSetFileIdentifier{ + FileSetContentType: persist.FileSetIndexContentType, + Namespace: nsID, + BlockStart: blockStart, + VolumeIndex: volumeIndex, + } + blockSize := nsMetadata.Options().IndexOptions().BlockSize() + idxWriterOpts := IndexWriterOpenOptions{ + BlockSize: blockSize, + FileSetType: opts.FileSetType, + Identifier: fileSetID, + Shards: opts.Shards, + IndexVolumeType: opts.IndexVolumeType, + Snapshot: IndexWriterSnapshotOptions{ + SnapshotTime: snapshotTime, + SnapshotID: snapshotID, + }, + } + + // create writer for required fileset file.
+ if err := pm.indexPM.writer.Open(idxWriterOpts); err != nil { + return prepared, err + } + + // track which file we are writing in the persist manager, so we + // know which file to read back on `closeIndex` being called. + pm.indexPM.fileSetIdentifier = fileSetID + pm.indexPM.fileSetType = opts.FileSetType + pm.indexPM.initialized = true + + // provide persistManager hooks into PreparedIndexSnapshotPersist object + prepared.Persist = pm.persistIndexSnapshot + prepared.Close = pm.closeIndex + prepared.VolumeIndex = volumeIndex + + return prepared, nil +} + +func (pm *persistManager) persistIndexSnapshot(data fst.SegmentData) error { + // FOLLOWUP(prateek): need to use-rate limiting runtime options in this code path + markError := func(err error) { + pm.indexPM.writeErr = err + } + if err := pm.indexPM.writeErr; err != nil { + return fmt.Errorf("encountered error: %v, skipping further attempts to persist index snapshot", err) + } + + if err := pm.indexPM.segmentDataWriter.Reset(data); err != nil { + markError(err) + return err + } + + if err := pm.indexPM.writer.WriteSegmentFileSet(pm.indexPM.segmentDataWriter); err != nil { + markError(err) + return err + } + + return nil +} + +func (pm *persistManager) closeIndex() error { // ensure StartIndexPersist was called if !pm.indexPM.initialized { - return nil, errPersistManagerNotPersisting + return errPersistManagerNotPersisting } pm.indexPM.initialized = false - // i.e. we're done writing all segments for PreparedIndexPersist. + // i.e. we're done writing all segments for PreparedIndexFlushPersist. // so we can close the writer. if err := pm.indexPM.writer.Close(); err != nil { - return nil, err + return err } - // only attempt to retrieve data if we have not encountered errors during - // any writes. 
- if err := pm.indexPM.writeErr; err != nil { + // return any write errors + return pm.indexPM.writeErr +} + +func (pm *persistManager) closeIndexAndReadIndexSegments() ([]segment.Segment, error) { + if err := pm.closeIndex(); err != nil { return nil, err } - - // and then we get persistent segments backed by mmap'd data so the index + // Only attempt to retrieve data if we have not encountered errors during + // any writes. We get persistent segments backed by mmap'd data so the index // can safely evict the segment's we have just persisted. return ReadIndexSegments(ReadIndexSegmentsOptions{ ReaderOptions: IndexReaderOpenOptions{ @@ -386,7 +501,7 @@ func (pm *persistManager) StartFlushPersist() (persist.FlushPreparer, error) { return pm, nil } -// StartSnapshotPersist is called by the databaseFlushManager to begin the snapshot process. +// StartSnapshotPersist is called by the databaseFlushManager to begin the data & index snapshot process. func (pm *persistManager) StartSnapshotPersist(snapshotID uuid.UUID) (persist.SnapshotPreparer, error) { pm.Lock() defer pm.Unlock() @@ -394,9 +509,10 @@ func (pm *persistManager) StartSnapshotPersist(snapshotID uuid.UUID) (persist.Sn if pm.status != persistManagerIdle { return nil, errPersistManagerNotIdle } - pm.status = persistManagerPersistingData + pm.status = persistManagerPersistingSnapshot pm.dataPM.fileSetType = persist.FileSetSnapshotType pm.dataPM.snapshotID = snapshotID + pm.indexPM.snapshotID = snapshotID return pm, nil } @@ -413,27 +529,31 @@ func (pm *persistManager) PrepareData(opts persist.DataPrepareOptions) (persist. 
prepared persist.PreparedDataPersist ) - // ensure StartDataPersist has been called - pm.RLock() - status := pm.status - pm.RUnlock() - - if status != persistManagerPersistingData { - return prepared, errPersistManagerCannotPrepareDataNotPersisting - } - exists, err := pm.dataFilesetExists(opts) if err != nil { return prepared, err } + // ensure StartDataPersist or StartSnapshotPersist has been called + pm.RLock() + status := pm.status + pm.RUnlock() + var volumeIndex int switch opts.FileSetType { case persist.FileSetFlushType: + if status != persistManagerPersistingData { + return prepared, errPersistManagerCannotPrepareDataNotPersisting + } + // Use the volume index passed in. This ensures that the volume index is // the same as the cold flush version. volumeIndex = opts.VolumeIndex case persist.FileSetSnapshotType: + if status != persistManagerPersistingSnapshot { + return prepared, errPersistManagerCannotPrepareDataNotPersistingSnapshot + } + // Need to work out the volume index for the next snapshot. 
volumeIndex, err = NextSnapshotFileSetVolumeIndex(pm.opts.FilePathPrefix(), nsMetadata.ID(), shard, blockStart) @@ -572,7 +692,7 @@ func (pm *persistManager) DoneSnapshot( pm.Lock() defer pm.Unlock() - if pm.status != persistManagerPersistingData { + if pm.status != persistManagerPersistingSnapshot { return errPersistManagerNotPersisting } diff --git a/src/dbnode/persist/fs/persist_manager_test.go b/src/dbnode/persist/fs/persist_manager_test.go index b582680671..7e1400cec6 100644 --- a/src/dbnode/persist/fs/persist_manager_test.go +++ b/src/dbnode/persist/fs/persist_manager_test.go @@ -32,6 +32,7 @@ import ( "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" m3ninxfs "github.com/m3db/m3/src/m3ninx/index/segment/fst" m3ninxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" @@ -249,6 +250,7 @@ func TestPersistenceManagerPrepareSnapshotSuccess(t *testing.T) { Snapshot: DataWriterSnapshotOptions{ SnapshotID: testSnapshotID, }, + FileSetType: persist.FileSetSnapshotType, }, m3test.IdentTransformer) writer.EXPECT().Open(writerOpts).Return(nil) @@ -289,6 +291,7 @@ func TestPersistenceManagerPrepareSnapshotSuccess(t *testing.T) { NamespaceMetadata: testNs1Metadata(t), Shard: shard, BlockStart: blockStart, + FileSetType: persist.FileSetSnapshotType, } prepared, err := flush.PrepareData(prepareOpts) defer prepared.Close() @@ -317,16 +320,16 @@ func TestPersistenceManagerCloseIndex(t *testing.T) { ctrl := gomock.NewController(xtest.Reporter{T: t}) defer ctrl.Finish() - pm, _, _, _ := testIndexPersistManager(t, ctrl) + pm, _, _, _, _ := testIndexPersistManager(t, ctrl) defer os.RemoveAll(pm.filePathPrefix) pm.closeIndex() } -func TestPersistenceManagerPrepareIndexFileExists(t *testing.T) { +func TestPersistenceManagerPrepareIndexFlushFileExists(t *testing.T) { ctrl := gomock.NewController(xtest.Reporter{T: t}) defer 
ctrl.Finish() - pm, writer, segWriter, _ := testIndexPersistManager(t, ctrl) + pm, writer, segWriter, segDataWriter, _ := testIndexPersistManager(t, ctrl) defer os.RemoveAll(pm.filePathPrefix) blockStart := time.Unix(1000, 0) @@ -344,6 +347,7 @@ func TestPersistenceManagerPrepareIndexFileExists(t *testing.T) { defer func() { segWriter.EXPECT().Reset(nil) + segDataWriter.EXPECT().Reset(fst.SegmentData{}) assert.NoError(t, flush.DoneIndex()) }() @@ -362,17 +366,17 @@ func TestPersistenceManagerPrepareIndexFileExists(t *testing.T) { }, }, m3test.IdentTransformer), ).Return(nil) - prepared, err := flush.PrepareIndex(prepareOpts) + prepared, err := flush.PrepareIndexFlush(prepareOpts) require.NoError(t, err) require.NotNil(t, prepared.Persist) require.NotNil(t, prepared.Close) } -func TestPersistenceManagerPrepareIndexOpenError(t *testing.T) { +func TestPersistenceManagerPrepareIndexFlushOpenError(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - pm, writer, segWriter, _ := testIndexPersistManager(t, ctrl) + pm, writer, segWriter, segDataWriter, _ := testIndexPersistManager(t, ctrl) defer os.RemoveAll(pm.filePathPrefix) ns1Md := testNs1Metadata(t) @@ -394,6 +398,7 @@ func TestPersistenceManagerPrepareIndexOpenError(t *testing.T) { defer func() { segWriter.EXPECT().Reset(nil) + segDataWriter.EXPECT().Reset(fst.SegmentData{}) assert.NoError(t, flush.DoneIndex()) }() @@ -401,17 +406,17 @@ func TestPersistenceManagerPrepareIndexOpenError(t *testing.T) { NamespaceMetadata: ns1Md, BlockStart: blockStart, } - prepared, err := flush.PrepareIndex(prepareOpts) + prepared, err := flush.PrepareIndexFlush(prepareOpts) require.Equal(t, expectedErr, err) require.Nil(t, prepared.Persist) require.Nil(t, prepared.Close) } -func TestPersistenceManagerPrepareIndexSuccess(t *testing.T) { +func TestPersistenceManagerPrepareIndexFlushSuccess(t *testing.T) { ctrl := gomock.NewController(xtest.Reporter{T: t}) defer ctrl.Finish() - pm, writer, segWriter, _ := 
testIndexPersistManager(t, ctrl) + pm, writer, segWriter, segDataWriter, _ := testIndexPersistManager(t, ctrl) defer os.RemoveAll(pm.filePathPrefix) blockStart := time.Unix(1000, 0) @@ -430,6 +435,7 @@ func TestPersistenceManagerPrepareIndexSuccess(t *testing.T) { defer func() { segWriter.EXPECT().Reset(nil) + segDataWriter.EXPECT().Reset(fst.SegmentData{}) assert.NoError(t, flush.DoneIndex()) }() @@ -437,7 +443,7 @@ func TestPersistenceManagerPrepareIndexSuccess(t *testing.T) { NamespaceMetadata: testNs1Metadata(t), BlockStart: blockStart, } - prepared, err := flush.PrepareIndex(prepareOpts) + prepared, err := flush.PrepareIndexFlush(prepareOpts) require.NoError(t, err) seg := segment.NewMockMutableSegment(ctrl) @@ -475,6 +481,59 @@ func TestPersistenceManagerPrepareIndexSuccess(t *testing.T) { require.Equal(t, fsSeg, segs[0]) } +func TestPersistenceManagerPrepareIndexSnapshotSuccess(t *testing.T) { + ctrl := gomock.NewController(xtest.Reporter{T: t}) + defer ctrl.Finish() + + pm, writer, segWriter, segDataWriter, _ := testIndexPersistManager(t, ctrl) + defer os.RemoveAll(pm.filePathPrefix) + + blockStart := time.Unix(1000, 0) + writerOpts := IndexWriterOpenOptions{ + Identifier: FileSetFileIdentifier{ + FileSetContentType: persist.FileSetIndexContentType, + Namespace: testNs1ID, + BlockStart: blockStart, + }, + FileSetType: persist.FileSetSnapshotType, + BlockSize: testBlockSize, + Snapshot: IndexWriterSnapshotOptions{ + SnapshotID: testSnapshotID, + }, + } + writer.EXPECT().Open(xtest.CmpMatcher(writerOpts, m3test.IdentTransformer)).Return(nil) + + flush, err := pm.StartSnapshotPersist(testSnapshotID) + require.NoError(t, err) + + defer func() { + segWriter.EXPECT().Reset(nil) + segDataWriter.EXPECT().Reset(fst.SegmentData{}) + assert.NoError(t, flush.DoneSnapshot(nil, persist.CommitLogFile{})) + }() + + prepareOpts := persist.IndexPrepareSnapshotOptions{ + IndexPrepareOptions: persist.IndexPrepareOptions{ + NamespaceMetadata: testNs1Metadata(t), + BlockStart: 
blockStart, + FileSetType: persist.FileSetSnapshotType, + }, + } + prepared, err := flush.PrepareIndexSnapshot(prepareOpts) + require.NoError(t, err) + + segData := fst.SegmentData{ + Version: fst.CurrentVersion, + } + segDataWriter.EXPECT().Reset(segData).Return(nil) + writer.EXPECT().WriteSegmentFileSet(segDataWriter).Return(nil) + require.NoError(t, prepared.Persist(segData)) + + writer.EXPECT().Close().Return(nil) + err = prepared.Close() + require.NoError(t, err) +} + func TestPersistenceManagerNoRateLimit(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -745,7 +804,13 @@ func testDataPersistManager( } func testIndexPersistManager(t *testing.T, ctrl *gomock.Controller, -) (*persistManager, *MockIndexFileSetWriter, *m3ninxpersist.MockMutableSegmentFileSetWriter, Options) { +) ( + *persistManager, + *MockIndexFileSetWriter, + *m3ninxpersist.MockMutableSegmentFileSetWriter, + *m3ninxpersist.MockFSTSegmentDataFileSetWriter, + Options, +) { dir := createTempDir(t) opts := testDefaultOpts. 
@@ -754,6 +819,7 @@ func testIndexPersistManager(t *testing.T, ctrl *gomock.Controller, writer := NewMockIndexFileSetWriter(ctrl) segmentWriter := m3ninxpersist.NewMockMutableSegmentFileSetWriter(ctrl) + segmentDataWriter := m3ninxpersist.NewMockFSTSegmentDataFileSetWriter(ctrl) mgr, err := NewPersistManager(opts) require.NoError(t, err) @@ -761,5 +827,6 @@ func testIndexPersistManager(t *testing.T, ctrl *gomock.Controller, manager := mgr.(*persistManager) manager.indexPM.writer = writer manager.indexPM.segmentWriter = segmentWriter - return manager, writer, segmentWriter, opts + manager.indexPM.segmentDataWriter = segmentDataWriter + return manager, writer, segmentWriter, segmentDataWriter, opts } diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go index 7aa7d9fee6..b71419e3c0 100644 --- a/src/dbnode/persist/fs/types.go +++ b/src/dbnode/persist/fs/types.go @@ -49,6 +49,8 @@ import ( "github.com/m3db/m3/src/x/pool" "github.com/m3db/m3/src/x/serialize" xtime "github.com/m3db/m3/src/x/time" + + "github.com/pborman/uuid" ) // FileSetFileIdentifier contains all the information required to identify a FileSetFile @@ -347,6 +349,7 @@ type RetrievableDataBlockSegmentReader interface { // IndexWriterSnapshotOptions is a set of options for writing an index file set snapshot. type IndexWriterSnapshotOptions struct { SnapshotTime time.Time + SnapshotID uuid.UUID } // IndexWriterOpenOptions is a set of options when opening an index file set writer. @@ -719,3 +722,24 @@ type CrossBlockIterator interface { // Reset resets the iterator to the given block records. Reset(records []BlockRecord) } + +// ReadIndexInfoFilesFn reads in index info files given a namespace. +type ReadIndexInfoFilesFn func( + filePathPrefix string, + namespace ident.ID, + readerBufferSize int, + fileSetType persist.FileSetType, +) []ReadIndexInfoFileResult + +// SnapshotFilesFn reads in snapshot files given a namespace and shard. 
+type SnapshotFilesFn func( + filePathPrefix string, + namespace ident.ID, + shard uint32, +) (FileSetFilesSlice, error) + +// IndexSnapshotFilesFn reads in index snapshot files given a namespace. +type IndexSnapshotFilesFn func( + filePathPrefix string, + namespace ident.ID, +) (FileSetFilesSlice, error) diff --git a/src/dbnode/persist/fs/write.go b/src/dbnode/persist/fs/write.go index 5b2c17e6a6..f1736ace45 100644 --- a/src/dbnode/persist/fs/write.go +++ b/src/dbnode/persist/fs/write.go @@ -604,7 +604,7 @@ func (w *writer) writeInfoFileContents( summaries int, entriesCount int64, ) error { - snapshotBytes, err := w.snapshotID.MarshalBinary() + snapshotIDBytes, err := w.snapshotID.MarshalBinary() if err != nil { return fmt.Errorf("error marshaling snapshot ID into bytes: %v", err) } @@ -613,7 +613,7 @@ func (w *writer) writeInfoFileContents( BlockStart: xtime.ToNanoseconds(w.start), VolumeIndex: w.volumeIndex, SnapshotTime: xtime.ToNanoseconds(w.snapshotTime), - SnapshotID: snapshotBytes, + SnapshotID: snapshotIDBytes, BlockSize: int64(w.blockSize), Entries: entriesCount, MajorVersion: schema.MajorVersion, diff --git a/src/dbnode/persist/persist_mock.go b/src/dbnode/persist/persist_mock.go index 1ec52dcd02..3ee7b13787 100644 --- a/src/dbnode/persist/persist_mock.go +++ b/src/dbnode/persist/persist_mock.go @@ -239,6 +239,21 @@ func (mr *MockSnapshotPreparerMockRecorder) PrepareData(opts interface{}) *gomoc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareData", reflect.TypeOf((*MockSnapshotPreparer)(nil).PrepareData), opts) } +// PrepareIndexSnapshot mocks base method +func (m *MockSnapshotPreparer) PrepareIndexSnapshot(opts IndexPrepareSnapshotOptions) (PreparedIndexSnapshotPersist, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PrepareIndexSnapshot", opts) + ret0, _ := ret[0].(PreparedIndexSnapshotPersist) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// PrepareIndexSnapshot indicates an expected call of PrepareIndexSnapshot +func (mr 
*MockSnapshotPreparerMockRecorder) PrepareIndexSnapshot(opts interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareIndexSnapshot", reflect.TypeOf((*MockSnapshotPreparer)(nil).PrepareIndexSnapshot), opts) +} + // DoneSnapshot mocks base method func (m *MockSnapshotPreparer) DoneSnapshot(snapshotUUID uuid.UUID, commitLogIdentifier CommitLogFile) error { m.ctrl.T.Helper() @@ -276,19 +291,19 @@ func (m *MockIndexFlush) EXPECT() *MockIndexFlushMockRecorder { return m.recorder } -// PrepareIndex mocks base method -func (m *MockIndexFlush) PrepareIndex(opts IndexPrepareOptions) (PreparedIndexPersist, error) { +// PrepareIndexFlush mocks base method +func (m *MockIndexFlush) PrepareIndexFlush(opts IndexPrepareOptions) (PreparedIndexFlushPersist, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "PrepareIndex", opts) - ret0, _ := ret[0].(PreparedIndexPersist) + ret := m.ctrl.Call(m, "PrepareIndexFlush", opts) + ret0, _ := ret[0].(PreparedIndexFlushPersist) ret1, _ := ret[1].(error) return ret0, ret1 } -// PrepareIndex indicates an expected call of PrepareIndex -func (mr *MockIndexFlushMockRecorder) PrepareIndex(opts interface{}) *gomock.Call { +// PrepareIndexFlush indicates an expected call of PrepareIndexFlush +func (mr *MockIndexFlushMockRecorder) PrepareIndexFlush(opts interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareIndex", reflect.TypeOf((*MockIndexFlush)(nil).PrepareIndex), opts) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareIndexFlush", reflect.TypeOf((*MockIndexFlush)(nil).PrepareIndexFlush), opts) } // DoneIndex mocks base method diff --git a/src/dbnode/persist/types.go b/src/dbnode/persist/types.go index a24d4040b1..ca30ed84bf 100644 --- a/src/dbnode/persist/types.go +++ b/src/dbnode/persist/types.go @@ -29,6 +29,7 @@ import ( "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/m3ninx/doc" 
"github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" @@ -153,7 +154,7 @@ type DataCloser func() error // DeferCloser returns a DataCloser that persists the data checkpoint file when called. type DeferCloser func() (DataCloser, error) -// PreparedDataPersist is an object that wraps holds a persist function and a closer. +// PreparedDataPersist is an object that wraps a persist function and a closer. type PreparedDataPersist struct { Persist DataFn Close DataCloser @@ -180,20 +181,35 @@ type CommitLogFile struct { Index int64 } -// IndexFn is a function that persists a m3ninx MutableSegment. -type IndexFn func(segment.Builder) error +// IndexFlushFn is a function that persists a m3ninx MutableSegment. +type IndexFlushFn func(segment.Builder) error + +// IndexSnapshotFn is a function that persists fst SegmentData. +type IndexSnapshotFn func(fst.SegmentData) error // IndexCloser is a function that performs cleanup after persisting the index data // block for a (namespace, blockStart) combination and returns the corresponding // immutable Segment. type IndexCloser func() ([]segment.Segment, error) -// PreparedIndexPersist is an object that wraps holds a persist function and a closer. -type PreparedIndexPersist struct { - Persist IndexFn +// PreparedIndexFlushPersist is an object that wraps a index data persist function and a closer. +type PreparedIndexFlushPersist struct { + Persist IndexFlushFn Close IndexCloser } +// IndexSnapshotCloser is a function that performs cleanup after persisting the index snapshots +// for a (namespace, blockStart) combination. +type IndexSnapshotCloser func() error + +// PreparedIndexSnapshotPersist is an object that wraps holds a index snapshot +// persist function and a closer. 
+type PreparedIndexSnapshotPersist struct { + Persist IndexSnapshotFn + Close IndexSnapshotCloser + VolumeIndex int +} + // Manager manages the internals of persisting data onto storage layer. type Manager interface { // StartFlushPersist begins a data flush for a set of shards. @@ -231,6 +247,11 @@ type FlushPreparer interface { type SnapshotPreparer interface { Preparer + // PrepareIndexSnapshot prepares snapshotting index data for a given ns/blockStart, returning a + // PreparedIndexSnapshotPersist object and any error encountered during + // preparation if any. + PrepareIndexSnapshot(opts IndexPrepareSnapshotOptions) (PreparedIndexSnapshotPersist, error) + // DoneSnapshot marks the snapshot as complete. DoneSnapshot(snapshotUUID uuid.UUID, commitLogIdentifier CommitLogFile) error } @@ -238,10 +259,10 @@ type SnapshotPreparer interface { // IndexFlush is a persist flush cycle, each namespace, block combination needs // to explicitly be prepared. type IndexFlush interface { - // Prepare prepares writing data for a given ns/blockStart, returning a - // PreparedIndexPersist object and any error encountered during + // PrepareIndexFlush prepares flushing index data for a given ns/blockStart, returning a + // PreparedIndexFlushPersist object and any error encountered during // preparation if any. - PrepareIndex(opts IndexPrepareOptions) (PreparedIndexPersist, error) + PrepareIndexFlush(opts IndexPrepareOptions) (PreparedIndexFlushPersist, error) // DoneIndex marks the index flush as complete. DoneIndex() error @@ -271,13 +292,20 @@ type IndexPrepareOptions struct { FileSetType FileSetType Shards map[uint32]struct{} IndexVolumeType idxpersist.IndexVolumeType + SnapshotTime time.Time +} + +// IndexPrepareSnapshotOptions is the options struct for preparing index snapshots. 
+type IndexPrepareSnapshotOptions struct { + IndexPrepareOptions + + SnapshotTime time.Time } // DataPrepareSnapshotOptions is the options struct for the Prepare method that contains // information specific to read/writing snapshot files. type DataPrepareSnapshotOptions struct { SnapshotTime time.Time - SnapshotID uuid.UUID } // FileSetType is an enum that indicates what type of files a fileset contains diff --git a/src/dbnode/storage/bootstrap/bootstrap_mock.go b/src/dbnode/storage/bootstrap/bootstrap_mock.go index 7ecbc95f65..bee4bbb985 100644 --- a/src/dbnode/storage/bootstrap/bootstrap_mock.go +++ b/src/dbnode/storage/bootstrap/bootstrap_mock.go @@ -902,3 +902,17 @@ func (mr *MockSeriesRefMockRecorder) LoadBlock(block, writeType interface{}) *go mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadBlock", reflect.TypeOf((*MockSeriesRef)(nil).LoadBlock), block, writeType) } + +// LoadBlockAndIndex mocks base method +func (m *MockSeriesRef) LoadBlockAndIndex(block block.DatabaseBlock, writeType series.WriteType) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "LoadBlockAndIndex", block, writeType) + ret0, _ := ret[0].(error) + return ret0 +} + +// LoadBlockAndIndex indicates an expected call of LoadBlockAndIndex +func (mr *MockSeriesRefMockRecorder) LoadBlockAndIndex(block, writeType interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadBlockAndIndex", reflect.TypeOf((*MockSeriesRef)(nil).LoadBlockAndIndex), block, writeType) +} diff --git a/src/dbnode/storage/bootstrap/bootstrapper/README.md b/src/dbnode/storage/bootstrap/bootstrapper/README.md deleted file mode 100644 index b062aaac3d..0000000000 --- a/src/dbnode/storage/bootstrap/bootstrapper/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# bootstrapper - -The collection of bootstrappers comprise the task executed when bootstrapping a node. 
- -## Bootstrappers - -- `fs`: The filesystem bootstrapper, used to bootstrap as much data as possible from the local filesystem. -- `peers`: The peers bootstrapper, used to bootstrap any remaining data from peers. This is used for a full node join too. -- `commitlog`: The commit log bootstrapper, currently only used in the case that peers bootstrapping fails. Once the current block is being snapshotted frequently to disk it might be faster and make more sense to not actively use the peers bootstrapper and just use a combination of the filesystem bootstrapper and the minimal time range required from the commit log bootstrapper. - - *NOTE*: the commitlog bootstrapper is special cased in that it runs for the *entire* bootstrappable range per shard whereas other bootstrappers fill in the unfulfilled gaps as bootstrapping progresses. - -## Cache policies - -The tasks carried out by each bootstrapper vary a lot on the series cache policy being used. - -### CacheAll series cache policy - -For the cache all policy the filesystem bootstrapper will load all series and all the data for each block and return the entire set of data. This will keep every series and series block on heap. - -The peers bootstrapper similarly bootstraps all the data from peers that the filesystem does not have and returns the entire set of data fetched. - -### RecentlyRead series cache policy - -For the recently read policy the filesystem bootstrapper will simply fulfill the time ranges requested matching without actually loading the series and blocks from the files it discovers. This relies on data been fetched lazily from the filesystem when data is required for a series that does not live on heap. - -The peers bootstrapper will bootstrap all time ranges requested, and if performing a bootstrap with persistence enabled for a time range, will write the data to disk and then remove the results from memory. 
A bootstrap with persistence enabled is used for any data that is immutable at the time that bootstrapping commences. For time ranges that are mutable the peer bootstrapper will still write the data out to disk in a durable manner, but in the form of a snapshot, and the series and blocks will still be returned directly as a result from the bootstrapper. This enables the commit log bootstrapper to recover the data in case the node shuts down before the in-memory data can be flushed. - -## Topology Changes - -When nodes are added to a replication group, shards are given away to the joining node. Those shards are closed and we re-bootstrap with the shards that we own. -When nodes are removed from a replication group, shards from the removed node are given to remaining nodes in a replication group. The remaining nodes in the replication group will bootstrap the "new" shards that were assigned to it. -Note that we also take writes for shards that we own while bootstrapping. However, we do not allow warm/cold flushes to happen while bootstrapping. - -For example, see the following sequences: -(Node add) -- Node 1: - - Initial bootstrap (256 shards) - - Node add - - Bootstrap (128 shards) // These are the remaining shards it owns. -- Node 2: - - Node add - - Inital bootstrap (128 shards) // These are received from Node 1 - -(Node remove) -- Node 1: - - Node remove - - Bootstrap (128 shards) // These are received form Node 2, it owns 256 now. 
-- Node 2: - - Node remove diff --git a/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source.go b/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source.go index 6e2f810449..725cd9a45f 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source.go @@ -29,16 +29,20 @@ import ( "time" "github.com/m3db/m3/src/cluster/shard" + "github.com/m3db/m3/src/dbnode/generated/proto/index" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" "github.com/m3db/m3/src/dbnode/storage/bootstrap" + "github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper" "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" "github.com/m3db/m3/src/dbnode/storage/series" "github.com/m3db/m3/src/dbnode/topology" "github.com/m3db/m3/src/dbnode/tracepoint" "github.com/m3db/m3/src/dbnode/ts" + "github.com/m3db/m3/src/m3ninx/index/segment" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" @@ -57,8 +61,8 @@ const ( type newIteratorFn func(opts commitlog.IteratorOpts) ( iter commitlog.Iterator, corruptFiles []commitlog.ErrorWithPath, err error) -type snapshotFilesFn func(filePathPrefix string, namespace ident.ID, shard uint32) (fs.FileSetFilesSlice, error) -type newReaderFn func(bytesPool pool.CheckedBytesPool, opts fs.Options) (fs.DataFileSetReader, error) +type newDataReaderFn func(bytesPool pool.CheckedBytesPool, opts fs.Options) (fs.DataFileSetReader, error) +type readIndexSegmentsFn func(opts fs.ReadIndexSegmentsOptions) ([]segment.Segment, error) type commitLogSource struct { opts Options @@ -68,9 +72,12 @@ type commitLogSource struct { // Filesystem inspection capture before node was started. 
inspection fs.Inspection - newIteratorFn newIteratorFn - snapshotFilesFn snapshotFilesFn - newReaderFn newReaderFn + newIteratorFn newIteratorFn + snapshotFilesFn fs.SnapshotFilesFn + indexSnapshotFilesFn fs.IndexSnapshotFilesFn + newDataReaderFn newDataReaderFn + readIndexSegmentsFn readIndexSegmentsFn + readIndexInfoFilesFn fs.ReadIndexInfoFilesFn metrics commitLogSourceMetrics // Cache the results of reading the commit log between passes. The commit log is not sharded by time range, so the @@ -141,9 +148,12 @@ func newCommitLogSource( inspection: inspection, - newIteratorFn: commitlog.NewIterator, - snapshotFilesFn: fs.SnapshotFiles, - newReaderFn: fs.NewReader, + newIteratorFn: commitlog.NewIterator, + snapshotFilesFn: fs.SnapshotFiles, + indexSnapshotFilesFn: fs.IndexSnapshotFiles, + newDataReaderFn: fs.NewReader, + readIndexSegmentsFn: fs.ReadIndexSegments, + readIndexInfoFilesFn: fs.ReadIndexInfoFiles, metrics: newCommitLogSourceMetrics(scope), } @@ -190,6 +200,7 @@ func (s *commitLogSource) Read( fsOpts = s.opts.CommitLogOptions().FilesystemOptions() filePathPrefix = fsOpts.FilePathPrefix() namespaceIter = namespaces.Namespaces.Iter() + indexResults = make(map[string]result.IndexBootstrapResult, len(namespaceIter)) ) defer doneReadingData() @@ -217,15 +228,11 @@ func (s *commitLogSource) Read( if err != nil { return bootstrap.NamespaceResults{}, err } - - mostRecentCompleteSnapshotByBlockShard, err := s.mostRecentSnapshotByBlockShard( - ns.Metadata, shardTimeRanges, snapshotFilesByShard) - if err != nil { - return bootstrap.NamespaceResults{}, err - } + blockSize := ns.Metadata.Options().RetentionOptions().BlockSize() + mostRecentCompleteSnapshotByBlockShard := s.mostRecentCompleteSnapshotByBlockShard( + shardTimeRanges, blockSize, snapshotFilesByShard, fsOpts) // Start by reading any available snapshot files. 
- blockSize := ns.Metadata.Options().RetentionOptions().BlockSize() for shard, tr := range shardTimeRanges.Iter() { err := s.bootstrapShardSnapshots( ns.Metadata, accumulator, shard, tr, blockSize, @@ -234,6 +241,32 @@ func (s *commitLogSource) Read( return bootstrap.NamespaceResults{}, err } } + + // Read index snapshot files if indexing is enabled. + if ns.Metadata.Options().IndexOptions().Enabled() { + indexSnapshotFiles, err := s.indexSnapshotFilesFn( + filePathPrefix, + ns.Metadata.ID(), + ) + if err != nil { + return bootstrap.NamespaceResults{}, err + } + indexBlockSize := ns.Metadata.Options().IndexOptions().BlockSize() + indexInfoFiles := s.readIndexInfoFilesFn(filePathPrefix, ns.Metadata.ID(), + fsOpts.InfoReaderBufferSize(), persist.FileSetSnapshotType) + // Get latest index snapshot per block start + mostRecentIndexSnapshotsByBlock := s.mostRecentCompleteIndexSnapshotByBlock( + shardTimeRanges, indexBlockSize, indexSnapshotFiles, indexInfoFiles, fsOpts) + indexResult := result.NewIndexBootstrapResult() + if err := s.bootstrapIndexSnapshots( + ns.Metadata, + indexResult, + mostRecentIndexSnapshotsByBlock, + ); err != nil { + return bootstrap.NamespaceResults{}, err + } + indexResults[ns.Metadata.ID().String()] = indexResult + } } s.log.Info("read snapshots done", @@ -263,7 +296,8 @@ func (s *commitLogSource) Read( } var indexResult result.IndexBootstrapResult if ns.Metadata.Options().IndexOptions().Enabled() { - indexResult = result.NewIndexBootstrapResult() + // We should have a result for each ns at this point if indexing is enabled. 
+ indexResult = indexResults[ns.Metadata.ID().String()] if s.commitLogResult.shouldReturnUnfulfilled { shardTimeRanges := ns.IndexRunOptions.ShardTimeRanges indexResult = shardTimeRanges.ToUnfulfilledIndexResult() @@ -283,7 +317,7 @@ func (s *commitLogSource) Read( type commitLogResult struct { shouldReturnUnfulfilled bool // ensures we only read the commit log once - read bool + read bool } func (s *commitLogSource) readCommitLog(namespaces bootstrap.Namespaces, span opentracing.Span) (commitLogResult, error) { @@ -617,8 +651,7 @@ func (s *commitLogSource) snapshotFilesByShard( // mostRecentCompleteSnapshotByBlockShard returns a // map[xtime.UnixNano]map[uint32]fs.FileSetFile with the contract that // for each shard/block combination in shardsTimeRanges, an entry will -// exist in the map such that FileSetFile.CachedSnapshotTime is the -// actual cached snapshot time, or the blockStart. +// exist in the map if a valid snapshot was found. func (s *commitLogSource) mostRecentCompleteSnapshotByBlockShard( shardsTimeRanges result.ShardTimeRanges, blockSize time.Duration, @@ -640,17 +673,15 @@ func (s *commitLogSource) mostRecentCompleteSnapshotByBlockShard( ) defer func() { + if mostRecentSnapshot.IsZero() { + // Do nothing if we could not find a valid snapshot. + return + } + existing := mostRecentSnapshotsByBlockShard[currBlockUnixNanos] if existing == nil { existing = map[uint32]fs.FileSetFile{} } - - if mostRecentSnapshot.IsZero() { - // If we were unable to determine the most recent snapshot time for a given - // shard/blockStart combination, then just fall back to using the blockStart - // time as that will force us to read the entire commit log for that duration. 
- mostRecentSnapshot.CachedSnapshotTime = currBlockStart - } existing[shard] = mostRecentSnapshot mostRecentSnapshotsByBlockShard[currBlockUnixNanos] = existing }() @@ -669,8 +700,7 @@ func (s *commitLogSource) mostRecentCompleteSnapshotByBlockShard( return } - // Make sure we're able to read the snapshot time. This will also set the - // CachedSnapshotTime field so that we can rely upon it from here on out. + // Make sure we're able to read the snapshot time. _, _, err := mostRecentSnapshotVolume.SnapshotTimeAndID() if err != nil { namespace := mostRecentSnapshot.ID.Namespace @@ -687,9 +717,6 @@ func (s *commitLogSource) mostRecentCompleteSnapshotByBlockShard( zap.Error(err), ). Error("error resolving snapshot time for snapshot file") - - // If we couldn't determine the snapshot time for the snapshot file, then rely - // on the defer to fallback to using the block start time. return } @@ -701,6 +728,99 @@ func (s *commitLogSource) mostRecentCompleteSnapshotByBlockShard( return mostRecentSnapshotsByBlockShard } +type indexSnapshot struct { + fileSet fs.FileSetFile + info index.IndexVolumeInfo + fulfilled result.ShardTimeRanges +} + +// mostRecentCompleteIndexSnapshotByBlock returns a +// map[xtime.UnixNano]indexSnapshot with the contract that +// for each block in shardsTimeRanges, an entry will +// exist in the map if there exists a valid snapshot.
+func (s *commitLogSource) mostRecentCompleteIndexSnapshotByBlock( + shardsTimeRanges result.ShardTimeRanges, + blockSize time.Duration, + indexSnapshotFiles fs.FileSetFilesSlice, + indexInfoFiles []fs.ReadIndexInfoFileResult, + fsOpts fs.Options, +) map[xtime.UnixNano]indexSnapshot { + var ( + minBlock, maxBlock = shardsTimeRanges.MinMax() + mostRecentIndexSnapshotsByBlock = map[xtime.UnixNano]indexSnapshot{} + latestIndexVolumeInfoByBlockStart = map[xtime.UnixNano]index.IndexVolumeInfo{} + ) + + for _, info := range indexInfoFiles { + latestIndexVolumeInfoByBlockStart[xtime.ToUnixNano(info.ID.BlockStart)] = info.Info + } + + for currBlockStart := minBlock.Truncate(blockSize); currBlockStart.Before(maxBlock); currBlockStart = currBlockStart.Add(blockSize) { + // Anonymous func for easier clean up using defer. + func() { + var ( + currBlockUnixNanos = xtime.ToUnixNano(currBlockStart) + mostRecentSnapshot fs.FileSetFile + indexVolumeInfo index.IndexVolumeInfo + ok bool + ) + + defer func() { + if mostRecentSnapshot.IsZero() { + // Do nothing if no valid snapshot is found. + return + } + fulfilled := bootstrapper.IntersectingShardTimeRanges( + shardsTimeRanges, + indexVolumeInfo.Shards, + currBlockStart, + blockSize, + ) + mostRecentIndexSnapshotsByBlock[currBlockUnixNanos] = indexSnapshot{ + fileSet: mostRecentSnapshot, + info: indexVolumeInfo, + fulfilled: fulfilled, + } + }() + + indexVolumeInfo, ok = latestIndexVolumeInfoByBlockStart[currBlockUnixNanos] + if !ok { + // If there are no index info files for this block, mostRecentSnapshot + // stays unset and the deferred func records no entry for the block. + return + } + + mostRecentSnapshotVolume, ok := indexSnapshotFiles.LatestVolumeForBlock(currBlockStart) + if !ok { + // If there are no complete snapshot files for this block, mostRecentSnapshot + // stays unset and the deferred func records no entry for the block. + return + } + + // Make sure we're able to read the snapshot time.
+ _, _, err := mostRecentSnapshotVolume.SnapshotTimeAndID() + if err != nil { + namespace := mostRecentSnapshotVolume.ID.Namespace // log the candidate volume; mostRecentSnapshot is still unset here + if namespace == nil { + namespace = ident.StringID("") + } + s.log.With( + zap.Stringer("namespace", namespace), + zap.Time("blockStart", mostRecentSnapshotVolume.ID.BlockStart), + zap.Int("index", mostRecentSnapshotVolume.ID.VolumeIndex), + zap.Strings("filepaths", mostRecentSnapshotVolume.AbsoluteFilePaths), + zap.Error(err), + ).Error("error resolving snapshot time for index snapshot file") + return + } + + mostRecentSnapshot = mostRecentSnapshotVolume + }() + } + + return mostRecentIndexSnapshotsByBlock +} + func (s *commitLogSource) bootstrapShardSnapshots( ns namespace.Metadata, accumulator bootstrap.NamespaceDataAccumulator, @@ -751,17 +871,14 @@ func (s *commitLogSource) bootstrapShardSnapshots( } for blockStart := currRange.Start.Truncate(blockSize); blockStart.Before(currRange.End); blockStart = blockStart.Add(blockSize) { - snapshotsForBlock := mostRecentCompleteSnapshotByBlockShard[xtime.ToUnixNano(blockStart)] - mostRecentCompleteSnapshotForShardBlock := snapshotsForBlock[shard] - - if mostRecentCompleteSnapshotForShardBlock.CachedSnapshotTime.Equal(blockStart) || - // Should never happen - mostRecentCompleteSnapshotForShardBlock.IsZero() { - // There is no snapshot file for this time, and even if there was, there would - // be no point in reading it. In this specific case its not an error scenario - // because the fact that snapshotTime == blockStart means we already accounted - // for the fact that this snapshot did not exist when we were deciding which - // commit logs to read.
+ snapshotsForBlock, ok := mostRecentCompleteSnapshotByBlockShard[xtime.ToUnixNano(blockStart)] + if !ok { + s.log.Debug("no snapshots for blockStart", + zap.Time("blockStart", blockStart)) + continue + } + mostRecentCompleteSnapshotForShardBlock, ok := snapshotsForBlock[shard] + if !ok { s.log.Debug("no snapshots for shard and blockStart", zap.Uint32("shard", shard), zap.Time("blockStart", blockStart)) continue @@ -801,7 +918,7 @@ func (s *commitLogSource) bootstrapShardBlockSnapshot( ) // Bootstrap the snapshot file. - reader, err := s.newReaderFn(bytesPool, fsOpts) + reader, err := s.newDataReaderFn(bytesPool, fsOpts) if err != nil { return err } @@ -881,36 +998,74 @@ func (s *commitLogSource) bootstrapShardBlockSnapshot( return nil } -func (s *commitLogSource) mostRecentSnapshotByBlockShard( +func (s *commitLogSource) bootstrapIndexSnapshots( ns namespace.Metadata, - shardsTimeRanges result.ShardTimeRanges, - snapshotFilesByShard map[uint32]fs.FileSetFilesSlice, -) ( - map[xtime.UnixNano]map[uint32]fs.FileSetFile, - error, -) { - blockSize := ns.Options().RetentionOptions().BlockSize() - - mostRecentCompleteSnapshotByBlockShard := s.mostRecentCompleteSnapshotByBlockShard( - shardsTimeRanges, blockSize, snapshotFilesByShard, s.opts.CommitLogOptions().FilesystemOptions()) - for block, mostRecentByShard := range mostRecentCompleteSnapshotByBlockShard { - for shard, mostRecent := range mostRecentByShard { - - if mostRecent.CachedSnapshotTime.IsZero() { - // Should never happen. 
- return nil, instrument.InvariantErrorf( - "shard: %d and block: %s had zero value for most recent snapshot time", - shard, block.ToTime().String()) - } - - s.log.Debug("most recent snapshot for block", - zap.Time("blockStart", block.ToTime()), - zap.Uint32("shard", shard), - zap.Time("mostRecent", mostRecent.CachedSnapshotTime)) + indexResult result.IndexBootstrapResult, + mostRecentIndexSnapshotsByBlock map[xtime.UnixNano]indexSnapshot, +) error { + iOpts := s.opts.CommitLogOptions().InstrumentOptions() + for blockStart, snapshot := range mostRecentIndexSnapshotsByBlock { + if snapshot.info.IndexVolumeType == nil { + // NB(bodu): This should not happen since we are only writing index snapshots + // with a specified index volume type (either cold or warm). If there was no index + // snapshot for this block start then we are skipping based on above criteria. + instrument.EmitAndLogInvariantViolation(iOpts, func(l *zap.Logger) { + l.Error(fmt.Sprintf("no index volume type for snapshot blockStart: %v", blockStart.ToTime())) + }) + continue + } + indexVolumeType := idxpersist.IndexVolumeType(snapshot.info.IndexVolumeType.Value) + + if err := s.bootstrapIndexBlockSnapshot( + indexResult, + blockStart.ToTime(), + snapshot.fileSet, + snapshot.fulfilled, + indexVolumeType, + ); err != nil { + return err } } - return mostRecentCompleteSnapshotByBlockShard, nil + return nil +} + +func (s *commitLogSource) bootstrapIndexBlockSnapshot( + indexResult result.IndexBootstrapResult, + blockStart time.Time, + mostRecentIndexSnapshot fs.FileSetFile, + fulfilled result.ShardTimeRanges, + indexVolumeType idxpersist.IndexVolumeType, +) error { + var ( + fsOpts = s.opts.CommitLogOptions().FilesystemOptions() + ) + + s.log.Debug("reading index snapshot segments", + zap.Time("blockStart", blockStart), + zap.Int("volume", mostRecentIndexSnapshot.ID.VolumeIndex)) + + // Bootstrap the index snapshot file. 
+ segments, err := s.readIndexSegmentsFn(fs.ReadIndexSegmentsOptions{ + ReaderOptions: fs.IndexReaderOpenOptions{ + Identifier: mostRecentIndexSnapshot.ID, + FileSetType: persist.FileSetSnapshotType, + }, + FilesystemOptions: fsOpts, + }) + if err != nil { + return err + } + snapshottedSegments := make([]result.Segment, 0, len(segments)) + for _, seg := range segments { + // Snapshotted segments have not been persisted to disk yet. + snapshottedSegments = append(snapshottedSegments, result.NewSegment(seg, false)) + } + + indexBlockByVolumeType := result.NewIndexBlockByVolumeType(blockStart) + indexBlockByVolumeType.SetBlock(indexVolumeType, result.NewIndexBlock(snapshottedSegments, fulfilled)) + indexResult.Add(indexBlockByVolumeType, nil) + return nil } // TODO(rartoul): Refactor this to take the SnapshotMetadata files into account to reduce @@ -952,6 +1107,12 @@ func (s *commitLogSource) startAccumulateWorker(worker *accumulateWorker) { ) worker.datapointsRead++ + // TODO(bodu): Currently the entry Write transparently indexes the series if we have + // not yet attempted to index that series. For high cardinality workloads w/ low churn, + // this means that we end up doubly indexing series since they may already exist in the + // index snapshot. At some point in the future we should check that a series is not already + // in the index snapshot and/or on disk for cold blocks (this might be difficult/expensive to do) + // for the index block that covers this series write.
_, _, err := entry.Series.Write(ctx, dp.Timestamp, dp.Value, unit, annotation, series.WriteOptions{ SchemaDesc: namespace.namespaceContext.Schema, diff --git a/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source_data_test.go b/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source_data_test.go index a747a65ae4..9c67f4077f 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source_data_test.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/commitlog/source_data_test.go @@ -454,7 +454,7 @@ func testItMergesSnapshotsAndCommitLogs(t *testing.T, opts Options, ) mockReader.EXPECT().Read().Return(nil, nil, nil, uint32(0), io.EOF) - src.newReaderFn = func( + src.newDataReaderFn = func( bytesPool pool.CheckedBytesPool, opts fs.Options, ) (fs.DataFileSetReader, error) { diff --git a/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go b/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go index 07943e89d2..6a655b64dc 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go @@ -30,25 +30,18 @@ import ( "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/migration" - "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/bootstrap" "github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper" "github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper/fs/migrator" "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" - "github.com/m3db/m3/src/dbnode/storage/index" - "github.com/m3db/m3/src/dbnode/storage/index/compaction" - "github.com/m3db/m3/src/dbnode/storage/index/convert" "github.com/m3db/m3/src/dbnode/storage/series" "github.com/m3db/m3/src/dbnode/tracepoint" "github.com/m3db/m3/src/dbnode/ts" - "github.com/m3db/m3/src/m3ninx/doc" - "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist 
"github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" - "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/pool" xtime "github.com/m3db/m3/src/x/time" @@ -72,19 +65,20 @@ type newDataFileSetReaderFn func( ) (fs.DataFileSetReader, error) type fileSystemSource struct { - opts Options - fsopts fs.Options - log *zap.Logger - nowFn clock.NowFn - idPool ident.Pool - newReaderFn newDataFileSetReaderFn - newReaderPoolOpts bootstrapper.NewReaderPoolOptions - metrics fileSystemSourceMetrics + opts Options + fsopts fs.Options + log *zap.Logger + nowFn clock.NowFn + idPool ident.Pool + newReaderFn newDataFileSetReaderFn + newReaderPoolOpts bootstrapper.NewReaderPoolOptions + metrics fileSystemSourceMetrics + readIndexInfoFilesFn fs.ReadIndexInfoFilesFn } type fileSystemSourceMetrics struct { - persistedIndexBlocksRead tally.Counter - persistedIndexBlocksWrite tally.Counter + persistedIndexBlocksRead tally.Counter + persistedIndexSnapshotsRead tally.Counter } func newFileSystemSource(opts Options) (bootstrap.Source, error) { @@ -105,9 +99,10 @@ func newFileSystemSource(opts Options) (bootstrap.Source, error) { idPool: opts.IdentifierPool(), newReaderFn: fs.NewReader, metrics: fileSystemSourceMetrics{ - persistedIndexBlocksRead: scope.Counter("persist-index-blocks-read"), - persistedIndexBlocksWrite: scope.Counter("persist-index-blocks-write"), + persistedIndexBlocksRead: scope.Counter("persist-index-blocks-read"), + persistedIndexSnapshotsRead: scope.Counter("persist-index-snapshots-read"), }, + readIndexInfoFilesFn: fs.ReadIndexInfoFiles, } s.newReaderPoolOpts.Alloc = s.newReader @@ -144,13 +139,6 @@ func (s *fileSystemSource) Read( Results: bootstrap.NewNamespaceResultsMap(bootstrap.NamespaceResultsMapOptions{}), } - alloc := s.opts.ResultOptions().IndexDocumentsBuilderAllocator() - segBuilder, err := alloc() - if err != nil { - return bootstrap.NamespaceResults{}, err - } - 
builder := result.NewIndexBuilder(segBuilder) - // Perform any necessary migrations but don't block bootstrap process on failure. Will update info file // in-memory structures in place if migrations have written new files to disk. This saves us the need from // having to re-read migrated info files. @@ -172,7 +160,7 @@ func (s *fileSystemSource) Read( r, err := s.read(bootstrapDataRunType, md, namespace.DataAccumulator, namespace.DataRunOptions.ShardTimeRanges, - namespace.DataRunOptions.RunOptions, builder, span, cache) + namespace.DataRunOptions.RunOptions, span, cache) if err != nil { return bootstrap.NamespaceResults{}, err } @@ -202,7 +190,7 @@ func (s *fileSystemSource) Read( r, err := s.read(bootstrapIndexRunType, md, namespace.DataAccumulator, namespace.IndexRunOptions.ShardTimeRanges, - namespace.IndexRunOptions.RunOptions, builder, span, cache) + namespace.IndexRunOptions.RunOptions, span, cache) if err != nil { return bootstrap.NamespaceResults{}, err } @@ -325,22 +313,14 @@ func (s *fileSystemSource) bootstrapFromReaders( runResult *runResult, readerPool *bootstrapper.ReaderPool, readersCh <-chan bootstrapper.TimeWindowReaders, - builder *result.IndexBuilder, - persistManager *bootstrapper.SharedPersistManager, - compactor *bootstrapper.SharedCompactor, ) { var ( resultOpts = s.opts.ResultOptions() ) for timeWindowReaders := range readersCh { - // NB(bodu): Since we are re-using the same builder for all bootstrapped index blocks, - // it is not thread safe and requires reset after every processed index block. 
- builder.Builder().Reset() - s.loadShardReadersDataIntoShardResult(run, ns, accumulator, - runOpts, runResult, resultOpts, timeWindowReaders, readerPool, - builder, persistManager, compactor) + runOpts, runResult, resultOpts, timeWindowReaders, readerPool) } } @@ -391,21 +371,14 @@ func (s *fileSystemSource) loadShardReadersDataIntoShardResult( ropts result.Options, timeWindowReaders bootstrapper.TimeWindowReaders, readerPool *bootstrapper.ReaderPool, - builder *result.IndexBuilder, - persistManager *bootstrapper.SharedPersistManager, - compactor *bootstrapper.SharedCompactor, ) { var ( blockPool = ropts.DatabaseBlockOptions().DatabaseBlockPool() seriesCachePolicy = ropts.SeriesCachePolicy() timesWithErrors []time.Time nsCtx = namespace.NewContextFrom(ns) - docsPool = s.opts.IndexOptions().DocumentArrayPool() - batch = docsPool.Get() - totalEntries int totalFulfilledRanges = result.NewShardTimeRanges() ) - defer docsPool.Put(batch) requestedRanges := timeWindowReaders.Ranges remainingRanges := requestedRanges.Copy() @@ -421,43 +394,18 @@ func (s *fileSystemSource) loadShardReadersDataIntoShardResult( blockSize = ns.Options().RetentionOptions().BlockSize() err error ) - switch run { - case bootstrapDataRunType: - // Pass, since nothing to do. - case bootstrapIndexRunType: - runResult.addIndexBlockIfNotExists(start, ns) - default: - // Unreachable unless an internal method calls with a run type casted from int. - panic(fmt.Errorf("invalid run type: %d", run)) - } numEntries := r.Entries() for i := 0; err == nil && i < numEntries; i++ { switch run { case bootstrapDataRunType: err = s.readNextEntryAndRecordBlock(nsCtx, accumulator, shard, r, - runResult, start, blockSize, blockPool, seriesCachePolicy) - case bootstrapIndexRunType: - // We can just read the entry and index if performing an index run. 
- batch, err = s.readNextEntryAndMaybeIndex(r, batch, builder) - if err != nil { - s.log.Error("readNextEntryAndMaybeIndex failed", zap.Error(err), - zap.Time("timeRangeStart", timeRange.Start)) - } - totalEntries++ + start, blockSize, blockPool, seriesCachePolicy) default: // Unreachable unless an internal method calls with a run type casted from int. panic(fmt.Errorf("invalid run type: %d", run)) } } - // NB(bodu): Only flush if we've experienced no errors up to this point. - if err == nil && len(batch) > 0 { - batch, err = builder.FlushBatch(batch) - if err != nil { - s.log.Error("builder FlushBatch failed", zap.Error(err), - zap.Time("timeRangeStart", timeRange.Start)) - } - } if err == nil { // Validate the read results. @@ -469,8 +417,6 @@ func (s *fileSystemSource) loadShardReadersDataIntoShardResult( } else { err = fmt.Errorf("invalid series cache policy: %s", seriesCachePolicy.String()) } - case bootstrapIndexRunType: - validateErr = r.ValidateMetadata() default: // Unreachable unless an internal method calls with a run type casted from int. panic(fmt.Errorf("invalid run type: %d", run)) @@ -480,20 +426,6 @@ func (s *fileSystemSource) loadShardReadersDataIntoShardResult( } } - if err == nil && run == bootstrapIndexRunType { - // Mark index block as fulfilled. - fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange)) - runResult.Lock() - err = runResult.index.IndexResults().MarkFulfilled(start, fulfilled, - // NB(bodu): By default, we always load bootstrapped data into the default index volume. 
- idxpersist.DefaultIndexVolumeType, ns.Options().IndexOptions()) - runResult.Unlock() - if err != nil { - s.log.Error("indexResults MarkFulfilled failed", zap.Error(err), - zap.Time("timeRangeStart", timeRange.Start)) - } - } - if err == nil { fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange)) totalFulfilledRanges.AddRanges(fulfilled) @@ -506,138 +438,6 @@ func (s *fileSystemSource) loadShardReadersDataIntoShardResult( } } - var ( - noneRemaining = remainingRanges.IsEmpty() - shouldBuildSegment = run == bootstrapIndexRunType && - // NB(r): Do not try to build a segment if no entries to index. - totalEntries > 0 && - len(timesWithErrors) == 0 - ) - if shouldBuildSegment { - var ( - indexBlockSize = ns.Options().IndexOptions().BlockSize() - retentionPeriod = ns.Options().RetentionOptions().RetentionPeriod() - beginningOfIndexRetention = retention.FlushTimeStartForRetentionPeriod( - retentionPeriod, indexBlockSize, s.nowFn()) - initialIndexRange = xtime.Range{ - Start: beginningOfIndexRetention, - End: beginningOfIndexRetention.Add(indexBlockSize), - } - overlapsWithInitalIndexRange = false - min, max = requestedRanges.MinMax() - blockStart = min.Truncate(indexBlockSize) - blockEnd = blockStart.Add(indexBlockSize) - iopts = s.opts.ResultOptions().InstrumentOptions() - indexBlock result.IndexBlock - err error - ) - for _, remainingRange := range remainingRanges.Iter() { - if remainingRange.Overlaps(initialIndexRange) { - overlapsWithInitalIndexRange = true - } - } - - remainingMin, remainingMax := remainingRanges.MinMax() - fulfilledMin, fulfilledMax := totalFulfilledRanges.MinMax() - - // NB(bodu): Assume if we're bootstrapping data from disk that it is the "default" index volume type. 
- runResult.Lock() - existingIndexBlock, ok := bootstrapper.GetDefaultIndexBlockForBlockStart(runResult.index.IndexResults(), blockStart) - runResult.Unlock() - if !ok { - err := fmt.Errorf("could not find index block in results: time=%s, ts=%d", - blockStart.String(), blockStart.UnixNano()) - instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) { - l.Error("index bootstrap failed", - zap.Error(err), - zap.Stringer("namespace", ns.ID()), - zap.Stringer("requestedRanges", requestedRanges)) - }) - } - - // Determine if should flush data for range. - persistCfg := runOpts.PersistConfig() - shouldFlush := persistCfg.Enabled && - persistCfg.FileSetType == persist.FileSetFlushType - - // Determine all requested ranges were fulfilled or at edge of retention - satisifiedFlushRanges := noneRemaining || overlapsWithInitalIndexRange - - buildIndexLogFields := []zapcore.Field{ - zap.Stringer("namespace", ns.ID()), - zap.Bool("shouldBuildSegment", shouldBuildSegment), - zap.Bool("noneRemaining", noneRemaining), - zap.Bool("overlapsWithInitalIndexRange", overlapsWithInitalIndexRange), - zap.Int("totalEntries", totalEntries), - zap.String("requestedRangesMinMax", fmt.Sprintf("%v - %v", min, max)), - zap.String("remainingRangesMinMax", fmt.Sprintf("%v - %v", remainingMin, remainingMax)), - zap.String("remainingRanges", remainingRanges.SummaryString()), - zap.String("totalFulfilledRangesMinMax", fmt.Sprintf("%v - %v", fulfilledMin, fulfilledMax)), - zap.String("totalFulfilledRanges", totalFulfilledRanges.SummaryString()), - zap.String("initialIndexRange", fmt.Sprintf("%v - %v", initialIndexRange.Start, initialIndexRange.End)), - zap.Bool("shouldFlush", shouldFlush), - zap.Bool("satisifiedFlushRanges", satisifiedFlushRanges), - } - - if shouldFlush && satisifiedFlushRanges { - s.log.Debug("building file set index segment", buildIndexLogFields...) 
- indexBlock, err = bootstrapper.PersistBootstrapIndexSegment( - ns, - requestedRanges, - builder.Builder(), - persistManager, - s.opts.ResultOptions(), - existingIndexBlock.Fulfilled(), - blockStart, - blockEnd, - ) - if err != nil { - instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) { - l.Error("persist fs index bootstrap failed", - zap.Error(err), - zap.Stringer("namespace", ns.ID()), - zap.Stringer("requestedRanges", requestedRanges)) - }) - } - // Track success. - s.metrics.persistedIndexBlocksWrite.Inc(1) - } else { - s.log.Info("building in-memory index segment", buildIndexLogFields...) - indexBlock, err = bootstrapper.BuildBootstrapIndexSegment( - ns, - requestedRanges, - builder.Builder(), - compactor, - s.opts.ResultOptions(), - s.opts.FilesystemOptions().MmapReporter(), - blockStart, - blockEnd, - ) - if err != nil { - iopts := s.opts.ResultOptions().InstrumentOptions() - instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) { - l.Error("build fs index bootstrap failed", - zap.Error(err), - zap.Stringer("namespace", ns.ID()), - zap.Stringer("requestedRanges", requestedRanges)) - }) - } - } - - // Merge segments and fulfilled time ranges. - segments := indexBlock.Segments() - for _, seg := range existingIndexBlock.Segments() { - segments = append(segments, seg) - } - newFulfilled := existingIndexBlock.Fulfilled().Copy() - newFulfilled.AddRanges(indexBlock.Fulfilled()) - - // Replace index block for default index volume type. - runResult.Lock() - runResult.index.IndexResults()[xtime.ToUnixNano(blockStart)].SetBlock(idxpersist.DefaultIndexVolumeType, result.NewIndexBlock(segments, newFulfilled)) - runResult.Unlock() - } - // Return readers to pool. 
for _, shardReaders := range shardReaders { for _, r := range shardReaders.Readers { @@ -656,7 +456,6 @@ func (s *fileSystemSource) readNextEntryAndRecordBlock( accumulator bootstrap.NamespaceDataAccumulator, shardID uint32, r fs.DataFileSetReader, - runResult *runResult, blockStart time.Time, blockSize time.Duration, blockPool block.DatabaseBlockPool, @@ -708,41 +507,12 @@ func (s *fileSystemSource) readNextEntryAndRecordBlock( return nil } -func (s *fileSystemSource) readNextEntryAndMaybeIndex( - r fs.DataFileSetReader, - batch []doc.Document, - builder *result.IndexBuilder, -) ([]doc.Document, error) { - // If performing index run, then simply read the metadata and add to segment. - id, tagsIter, _, _, err := r.ReadMetadata() - if err != nil { - return batch, err - } - - d, err := convert.FromSeriesIDAndTagIter(id, tagsIter) - // Finalize the ID and tags. - id.Finalize() - tagsIter.Close() - if err != nil { - return batch, err - } - - batch = append(batch, d) - - if len(batch) >= index.DocumentArrayPoolCapacity { - return builder.FlushBatch(batch) - } - - return batch, nil -} - func (s *fileSystemSource) read( run runType, md namespace.Metadata, accumulator bootstrap.NamespaceDataAccumulator, shardTimeRanges result.ShardTimeRanges, runOpts bootstrap.RunOptions, - builder *result.IndexBuilder, span opentracing.Span, cache bootstrap.Cache, ) (*runResult, error) { @@ -781,12 +551,18 @@ func (s *fileSystemSource) read( ) } if run == bootstrapIndexRunType { + infoFiles := s.readIndexInfoFilesFn(s.fsopts.FilePathPrefix(), md.ID(), + s.fsopts.InfoReaderBufferSize(), persist.FileSetFlushType) + logSpan("bootstrap_from_index_persisted_blocks_start") // NB(r): First read all the FSTs and add to runResult index results, // subtract the shard + time ranges from what we intend to bootstrap // for those we found. 
- r, err := s.bootstrapFromIndexPersistedBlocks(md, - shardTimeRanges) + r, err := s.bootstrapFromIndexPersistedBlocks( + md, + shardTimeRanges, + infoFiles, + ) if err != nil { s.log.Warn("filesystem bootstrapped failed to read persisted index blocks") } else { @@ -799,90 +575,60 @@ func (s *fileSystemSource) read( logSpan("bootstrap_from_index_persisted_blocks_done") } - // Create a reader pool once per bootstrap as we don't really want to - // allocate and keep around readers outside of the bootstrapping process, - // hence why its created on demand each time. - readerPool := bootstrapper.NewReaderPool(s.newReaderPoolOpts) - indexSegmentConcurrency := s.opts.IndexSegmentConcurrency() - readersCh := make(chan bootstrapper.TimeWindowReaders, indexSegmentConcurrency) - var blockSize time.Duration switch run { case bootstrapDataRunType: - blockSize = md.Options().RetentionOptions().BlockSize() - case bootstrapIndexRunType: - blockSize = md.Options().IndexOptions().BlockSize() - default: - panic(fmt.Errorf("unrecognized run type: %d", run)) - } - runtimeOpts := s.opts.RuntimeOptionsManager().Get() - go bootstrapper.EnqueueReaders(bootstrapper.EnqueueReadersOptions{ - NsMD: md, - RunOpts: runOpts, - RuntimeOpts: runtimeOpts, - FsOpts: s.fsopts, - ShardTimeRanges: shardTimeRanges, - ReaderPool: readerPool, - ReadersCh: readersCh, - BlockSize: blockSize, - // NB(bodu): We only read metadata when bootstrap index - // so we do not need to sort the data fileset reader. 
- OptimizedReadMetadataOnly: run == bootstrapIndexRunType, - Logger: s.log, - Span: span, - NowFn: s.nowFn, - Cache: cache, - }) - - bootstrapFromReadersRunResult := newRunResult() - - var buildWg sync.WaitGroup - for i := 0; i < indexSegmentConcurrency; i++ { - alloc := s.opts.ResultOptions().IndexDocumentsBuilderAllocator() - segBuilder, err := alloc() - if err != nil { - return nil, err - } + // Create a reader pool once per bootstrap as we don't really want to + // allocate and keep around readers outside of the bootstrapping process, + // hence why its created on demand each time. + readerPool := bootstrapper.NewReaderPool(s.newReaderPoolOpts) + concurrency := s.opts.IndexSegmentConcurrency() + readersCh := make(chan bootstrapper.TimeWindowReaders, concurrency) + + blockSize := md.Options().RetentionOptions().BlockSize() + runtimeOpts := s.opts.RuntimeOptionsManager().Get() + + go bootstrapper.EnqueueReaders(bootstrapper.EnqueueReadersOptions{ + NsMD: md, + RunOpts: runOpts, + RuntimeOpts: runtimeOpts, + FsOpts: s.fsopts, + ShardTimeRanges: shardTimeRanges, + ReaderPool: readerPool, + ReadersCh: readersCh, + BlockSize: blockSize, + Logger: s.log, + Span: span, + NowFn: s.nowFn, + Cache: cache, + }) - builder := result.NewIndexBuilder(segBuilder) - - indexOpts := s.opts.IndexOptions() - compactor, err := compaction.NewCompactor(indexOpts.DocumentArrayPool(), - index.DocumentArrayPoolCapacity, - indexOpts.SegmentBuilderOptions(), - indexOpts.FSTSegmentOptions(), - compaction.CompactorOptions{ - FSTWriterOptions: &fst.WriterOptions{ - // DisableRegistry is set to true to trade a larger FST size - // for a faster FST compaction since we want to reduce the end - // to end latency for time to first index a metric. 
- DisableRegistry: true, - }, - }) - if err != nil { - return nil, err - } + bootstrapFromReadersRunResult := newRunResult() + var buildWg sync.WaitGroup + for i := 0; i < concurrency; i++ { + buildWg.Add(1) + go func() { + s.bootstrapFromReaders(run, md, accumulator, runOpts, + bootstrapFromReadersRunResult, readerPool, readersCh) + buildWg.Done() + }() - persistManager, err := fs.NewPersistManager(s.opts.FilesystemOptions()) - if err != nil { - return nil, err } - - buildWg.Add(1) - go func() { - s.bootstrapFromReaders(run, md, - accumulator, runOpts, bootstrapFromReadersRunResult, - readerPool, readersCh, builder, - &bootstrapper.SharedPersistManager{Mgr: persistManager}, - &bootstrapper.SharedCompactor{Compactor: compactor}) - buildWg.Done() - }() + buildWg.Wait() + // Merge any existing results if necessary. + setOrMergeResult(bootstrapFromReadersRunResult) + case bootstrapIndexRunType: + // NB(bodu): We no longer persist index blocks for TSDB blocks missing persisted index blocks. + // See bootstrapper README section on "TSDB data on disk missing index data" for more details. + // We just mark these shard time ranges as unfulfilled. The commitlog bootstrapper should later + // mark any initialized shards as fulfilled, leaving only non-initialized shards for the peers + // bootstrapper to fulfill. + result := newRunResult() + result.index.SetUnfulfilled(shardTimeRanges) + setOrMergeResult(result) + default: + panic(fmt.Errorf("unrecognized run type: %d", run)) } - buildWg.Wait() - - // Merge any existing results if necessary. 
- setOrMergeResult(bootstrapFromReadersRunResult) - return res, nil } @@ -929,14 +675,13 @@ type bootstrapFromIndexPersistedBlocksResult struct { func (s *fileSystemSource) bootstrapFromIndexPersistedBlocks( ns namespace.Metadata, shardTimeRanges result.ShardTimeRanges, + infoFiles []fs.ReadIndexInfoFileResult, ) (bootstrapFromIndexPersistedBlocksResult, error) { res := bootstrapFromIndexPersistedBlocksResult{ fulfilled: result.NewShardTimeRanges(), } indexBlockSize := ns.Options().IndexOptions().BlockSize() - infoFiles := fs.ReadIndexInfoFiles(s.fsopts.FilePathPrefix(), ns.ID(), - s.fsopts.InfoReaderBufferSize()) for _, infoFile := range infoFiles { if err := infoFile.Err.Error(); err != nil { @@ -951,32 +696,12 @@ func (s *fileSystemSource) bootstrapFromIndexPersistedBlocks( info := infoFile.Info indexBlockStart := xtime.UnixNano(info.BlockStart).ToTime() - indexBlockRange := xtime.Range{ - Start: indexBlockStart, - End: indexBlockStart.Add(indexBlockSize), - } - willFulfill := result.NewShardTimeRanges() - for _, shard := range info.Shards { - tr, ok := shardTimeRanges.Get(shard) - if !ok { - // No ranges match for this shard. - continue - } - if _, ok := willFulfill.Get(shard); !ok { - willFulfill.Set(shard, xtime.NewRanges()) - } - - iter := tr.Iter() - for iter.Next() { - curr := iter.Value() - intersection, intersects := curr.Intersect(indexBlockRange) - if !intersects { - continue - } - willFulfill.GetOrAdd(shard).AddRange(intersection) - } - } - + willFulfill := bootstrapper.IntersectingShardTimeRanges( + shardTimeRanges, + info.Shards, + indexBlockStart, + indexBlockSize, + ) if willFulfill.IsEmpty() { // No matching shard/time ranges with this block. 
continue diff --git a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go index f67c75c82c..2a6bb1d8b4 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_data_test.go @@ -199,20 +199,6 @@ func testShardTimeRanges() result.ShardTimeRanges { return result.NewShardTimeRanges().Set(testShard, testTimeRanges()) } -func testBootstrappingIndexShardTimeRanges() result.ShardTimeRanges { - // NB: since index files are not corrupted on this run, it's expected that - // `testBlockSize` values should be fulfilled in the index block. This is - // `testBlockSize` rather than `testIndexSize` since the files generated - // by this test use 2 hour (which is `testBlockSize`) reader blocks. - return result.NewShardTimeRanges().Set( - testShard, - xtime.NewRanges(xtime.Range{ - Start: testStart.Add(testBlockSize), - End: testStart.Add(11 * time.Hour), - }), - ) -} - func writeGoodFiles(t *testing.T, dir string, namespace ident.ID, shard uint32) { writeGoodFilesWithFsOpts(t, namespace, shard, newTestFsOptions(dir)) } @@ -611,8 +597,7 @@ func TestReadDataCorruptionErrorNoIndex(t *testing.T) { } func TestReadDataCorruptionErrorWithIndex(t *testing.T) { - expectedIndex := testBootstrappingIndexShardTimeRanges() - testReadDataCorruptionErrorWithIndexEnabled(t, true, expectedIndex) + testReadDataCorruptionErrorWithIndexEnabled(t, true, testShardTimeRanges()) } func testReadDataCorruptionErrorWithIndexEnabled( @@ -676,8 +661,7 @@ func TestReadValidateErrorNoIndex(t *testing.T) { } func TestReadValidateErrorWithIndex(t *testing.T) { - expectedIndex := testBootstrappingIndexShardTimeRanges() - testReadValidateErrorWithIndexEnabled(t, true, expectedIndex) + testReadValidateErrorWithIndexEnabled(t, true, testShardTimeRanges()) } func testReadValidateErrorWithIndexEnabled( @@ -754,8 +738,7 @@ func TestReadOpenErrorNoIndex(t *testing.T) 
{ } func TestReadOpenErrorWithIndex(t *testing.T) { - expectedIndex := testBootstrappingIndexShardTimeRanges() - testReadOpenError(t, true, expectedIndex) + testReadOpenError(t, true, testShardTimeRanges()) } func testReadOpenError( @@ -854,10 +837,6 @@ func TestReadDeleteOnError(t *testing.T) { } reader.EXPECT().Open(rOpts).Return(nil).AnyTimes() - reader.EXPECT().ReadMetadata().Return(ident.StringID("foo"), - ident.NewTagsIterator(ident.Tags{}), 0, uint32(0), nil) - reader.EXPECT().ReadMetadata().Return(ident.StringID("bar"), - ident.NewTagsIterator(ident.Tags{}), 0, uint32(0), errors.New("foo")) reader.EXPECT(). Range(). diff --git a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_bench_test.go b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_bench_test.go deleted file mode 100644 index 0f2886912a..0000000000 --- a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_bench_test.go +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2019 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package fs - -import ( - "io/ioutil" - "math" - "net/http" - "net/http/httptest" - _ "net/http/pprof" // pprof: for debug listen server if configured - "os" - "strconv" - "strings" - "testing" - "time" - - "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist" - "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/dbnode/storage/bootstrap" - "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" - "github.com/m3db/m3/src/dbnode/storage/series" - xtime "github.com/m3db/m3/src/x/time" - - "github.com/davecgh/go-spew/spew" - "github.com/pkg/profile" - "github.com/stretchr/testify/require" -) - -// BenchmarkBootstrapIndex allows for testing indexing bootstrap time with the -// FS bootstrapper, this tests the speed and performance of index segment -// building from reading a set of files that sit on disk taken from a real -// DB node. 
-// To use test data and capture CPU profile run with: -// TEST_TSDB_DIR=/tmp/m3db_data PROFILE_CPU=true go test -v -run none -bench Index -func BenchmarkBootstrapIndex(b *testing.B) { - dir, err := ioutil.TempDir("", "var_lib_m3db_fake") - require.NoError(b, err) - defer os.RemoveAll(dir) - - srv := httptest.NewServer(http.DefaultServeMux) - spew.Printf("test server with pprof: %v\n", srv.URL) - - timesOpts := testTimesOptions{ - numBlocks: 2, - } - times := newTestBootstrapIndexTimes(timesOpts) - - testNamespace := testNs1ID - testNamespaceMetadata := testNsMetadata(b) - if testDir := os.Getenv("TEST_TSDB_DIR"); testDir != "" { - spew.Printf("using test dir: %s\n", testDir) - - // Allow for test directory overrides, must name the namespace - // "test_namespace" in the override directory. - dir = testDir - - namespaceDataDirPath := fs.NamespaceDataDirPath(dir, testNamespace) - handle, err := os.Open(namespaceDataDirPath) - require.NoError(b, err) - - results, err := handle.Readdir(0) - require.NoError(b, err) - - require.NoError(b, handle.Close()) - - var shards []uint32 - for _, result := range results { - if !result.IsDir() { - // Looking for shard directories. - spew.Printf("shard discover: entry not directory, %v\n", result.Name()) - continue - } - - v, err := strconv.Atoi(result.Name()) - if err != nil { - // Not a shard directory. - spew.Printf("shard discover: not number, %v, %v\n", result.Name(), err) - continue - } - - shards = append(shards, uint32(v)) - } - - spew.Printf("discovered shards: dir=%v, shards=%v\n", - namespaceDataDirPath, shards) - - // Clear the shard time ranges and add new ones. 
- times.shardTimeRanges = result.NewShardTimeRanges() - times.start = time.Unix(0, math.MaxInt64) - times.end = time.Unix(0, 0) - for _, shard := range shards { - var ( - min = time.Unix(0, math.MaxInt64) - max = time.Unix(0, 0) - ranges = xtime.NewRanges() - entries = fs.ReadInfoFiles(dir, testNamespace, shard, - 0, msgpack.NewDecodingOptions(), persist.FileSetFlushType) - ) - for _, entry := range entries { - if entry.Err != nil { - require.NoError(b, entry.Err.Error()) - } - - start := time.Unix(0, entry.Info.BlockStart) - if start.Before(min) { - min = start - } - - blockSize := time.Duration(entry.Info.BlockSize) - end := start.Add(blockSize) - if end.After(max) { - max = end - } - - ranges.AddRange(xtime.Range{Start: start, End: end}) - - // Override the block size if different. - namespaceOpts := testNamespaceMetadata.Options() - retentionOpts := namespaceOpts.RetentionOptions() - currBlockSize := retentionOpts.BlockSize() - if blockSize > currBlockSize { - newRetentionOpts := retentionOpts. - SetBlockSize(blockSize). - // 42yrs of retention to make sure blocks are in retention. - // Why 42? Because it's the answer to life, the universe and everything. - SetRetentionPeriod(42 * 365 * 24 * time.Hour) - newIndexOpts := namespaceOpts.IndexOptions().SetBlockSize(blockSize) - newNamespaceOpts := namespaceOpts. - SetRetentionOptions(newRetentionOpts). - SetIndexOptions(newIndexOpts) - testNamespaceMetadata, err = namespace.NewMetadata(testNamespace, newNamespaceOpts) - require.NoError(b, err) - } - } - - if ranges.IsEmpty() { - continue // Nothing to bootstrap for shard. - } - - times.shardTimeRanges.Set(shard, ranges) - - if min.Before(times.start) { - times.start = min - } - if max.After(times.end) { - times.end = max - } - } - } else { - writeTSDBGoodTaggedSeriesDataFiles(b, dir, testNamespace, times.start) - } - - testOpts := newTestOptionsWithPersistManager(b, dir). 
- SetResultOptions(testDefaultResultOpts.SetSeriesCachePolicy(series.CacheLRU)) - - src, err := newFileSystemSource(testOpts) - require.NoError(b, err) - - runOpts := testDefaultRunOpts. - SetPersistConfig(bootstrap.PersistConfig{ - Enabled: true, - FileSetType: persist.FileSetFlushType, - }) - - tester := bootstrap.BuildNamespacesTester(b, runOpts, - times.shardTimeRanges, testNamespaceMetadata) - defer tester.Finish() - - spew.Printf("running test with times: %v\n", times) - - if strings.ToLower(os.Getenv("PROFILE_CPU")) == "true" { - p := profile.Start(profile.CPUProfile) - defer p.Stop() - } - - b.ResetTimer() - b.StartTimer() - tester.TestReadWith(src) - b.StopTimer() -} diff --git a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_test.go b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_test.go index 14f83aa76d..ceca545e6a 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_test.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/fs/source_index_test.go @@ -21,6 +21,7 @@ package fs import ( + "encoding/binary" "fmt" "os" "testing" @@ -28,8 +29,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" - "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/bootstrap" "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" "github.com/m3db/m3/src/dbnode/storage/index/convert" @@ -37,6 +36,7 @@ import ( idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/ident" xtime "github.com/m3db/m3/src/x/time" + "github.com/stretchr/testify/require" "github.com/uber-go/tally" ) @@ -69,9 +69,6 @@ func newTestBootstrapIndexTimes( indexStart = start.Truncate(testIndexBlockSize) } end = start.Add(time.Duration(opts.numBlocks) * testIndexBlockSize) - case 3: - end = at.Truncate(testIndexBlockSize) - start = end.Add(time.Duration(-1*opts.numBlocks) * testBlockSize) default: panic("unexpected") } @@ -93,30 
+90,16 @@ func newTestBootstrapIndexTimes( type testSeriesBlocks [][]testSeries -func testGoodTaggedSeriesDataBlocks() testSeriesBlocks { - fooSeries := struct { - id string - tags map[string]string - }{ - "foo", - map[string]string{"aaa": "bbb", "ccc": "ddd"}, - } - dataBlocks := testSeriesBlocks{ - []testSeries{ - {fooSeries.id, fooSeries.tags, []byte{0x1}}, - {"bar", map[string]string{"eee": "fff", "ggg": "hhh"}, []byte{0x1}}, - {"baz", map[string]string{"iii": "jjj", "kkk": "lll"}, []byte{0x1}}, - }, - []testSeries{ - {fooSeries.id, fooSeries.tags, []byte{0x2}}, - {"qux", map[string]string{"mmm": "nnn", "ooo": "ppp"}, []byte{0x2}}, - {"qaz", map[string]string{"qqq": "rrr", "sss": "ttt"}, []byte{0x2}}, - }, - []testSeries{ - {fooSeries.id, fooSeries.tags, []byte{0x3}}, - {"qan", map[string]string{"uuu": "vvv", "www": "xxx"}, []byte{0x3}}, - {"qam", map[string]string{"yyy": "zzz", "000": "111"}, []byte{0x3}}, - }, +func testGoodTaggedSeriesDataBlocks(numBlocks int) testSeriesBlocks { + dataBlocks := make(testSeriesBlocks, 0, numBlocks) + for i := 0; i < numBlocks; i++ { + data := make([]byte, 8) + binary.LittleEndian.PutUint64(data, uint64(i)) + dataBlocks = append(dataBlocks, []testSeries{ + {fmt.Sprintf("foo-%d", i), map[string]string{"aaa": "bbb", "ccc": "ddd"}, data}, + {fmt.Sprintf("bar-%d", i), map[string]string{"eee": "fff", "ggg": "hhh"}, data}, + {fmt.Sprintf("baz-%d", i), map[string]string{"iii": "jjj", "kkk": "lll"}, data}, + }) } return dataBlocks } @@ -126,15 +109,12 @@ func writeTSDBGoodTaggedSeriesDataFiles( dir string, namespaceID ident.ID, start time.Time, + dataBlocks testSeriesBlocks, ) { - dataBlocks := testGoodTaggedSeriesDataBlocks() - - writeTSDBFiles(t, dir, namespaceID, testShard, - start, dataBlocks[0]) - writeTSDBFiles(t, dir, namespaceID, testShard, - start.Add(testBlockSize), dataBlocks[1]) - writeTSDBFiles(t, dir, namespaceID, testShard, - start.Add(2*testBlockSize), dataBlocks[2]) + for i := range dataBlocks { + writeTSDBFiles(t, 
dir, namespaceID, testShard, + start.Add(time.Duration(i)*testBlockSize), dataBlocks[i]) + } } func writeTSDBPersistedIndexBlock( @@ -167,7 +147,7 @@ func writeTSDBPersistedIndexBlock( flush, err := pm.StartIndexPersist() require.NoError(t, err) - preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ + preparedPersist, err := flush.PrepareIndexFlush(persist.IndexPrepareOptions{ NamespaceMetadata: namespace, BlockStart: start, FileSetType: persist.FileSetFlushType, @@ -189,122 +169,7 @@ func writeTSDBPersistedIndexBlock( require.NoError(t, err) } -type expectedTaggedSeries struct { - indexBlockStart time.Time - series map[string]testSeries -} - -func expectedTaggedSeriesWithOptions( - t require.TestingT, - start time.Time, - opts testTimesOptions, -) []expectedTaggedSeries { - dataBlocks := testGoodTaggedSeriesDataBlocks() - switch opts.numBlocks { - case 2: - return []expectedTaggedSeries{ - { - indexBlockStart: start, - series: map[string]testSeries{ - dataBlocks[0][0].id: dataBlocks[0][0], - dataBlocks[0][1].id: dataBlocks[0][1], - dataBlocks[0][2].id: dataBlocks[0][2], - dataBlocks[1][1].id: dataBlocks[1][1], - dataBlocks[1][2].id: dataBlocks[1][2], - }, - }, - { - indexBlockStart: start.Add(testIndexBlockSize), - series: map[string]testSeries{ - dataBlocks[2][0].id: dataBlocks[2][0], - dataBlocks[2][1].id: dataBlocks[2][1], - dataBlocks[2][2].id: dataBlocks[2][2], - }, - }, - } - case 3: - return []expectedTaggedSeries{ - { - indexBlockStart: start, - series: map[string]testSeries{ - dataBlocks[0][0].id: dataBlocks[0][0], - dataBlocks[0][1].id: dataBlocks[0][1], - dataBlocks[0][2].id: dataBlocks[0][2], - }, - }, - { - indexBlockStart: start.Add(testIndexBlockSize), - series: map[string]testSeries{ - dataBlocks[1][1].id: dataBlocks[1][1], - dataBlocks[1][2].id: dataBlocks[1][2], - dataBlocks[2][0].id: dataBlocks[2][0], - dataBlocks[2][1].id: dataBlocks[2][1], - dataBlocks[2][2].id: dataBlocks[2][2], - }, - }, - } - default: - require.FailNow(t, 
"unsupported test times options") - } - return nil -} - -func validateGoodTaggedSeries( - t require.TestingT, - start time.Time, - indexResults result.IndexResults, - opts testTimesOptions, -) { - require.Equal(t, 2, len(indexResults)) - - expectedSeriesByBlock := expectedTaggedSeriesWithOptions(t, start, opts) - for _, expected := range expectedSeriesByBlock { - expectedAt := xtime.ToUnixNano(expected.indexBlockStart) - indexBlockByVolumeType, ok := indexResults[expectedAt] - require.True(t, ok) - for _, indexBlock := range indexBlockByVolumeType.Iter() { - require.Equal(t, 1, len(indexBlock.Segments())) - for _, seg := range indexBlock.Segments() { - reader, err := seg.Segment().Reader() - require.NoError(t, err) - - docs, err := reader.AllDocs() - require.NoError(t, err) - - matches := map[string]struct{}{} - for docs.Next() { - curr := docs.Current() - - _, ok := matches[string(curr.ID)] - require.False(t, ok) - matches[string(curr.ID)] = struct{}{} - - series, ok := expected.series[string(curr.ID)] - require.True(t, ok) - - matchingTags := map[string]struct{}{} - for _, tag := range curr.Fields { - _, ok := matchingTags[string(tag.Name)] - require.False(t, ok) - matchingTags[string(tag.Name)] = struct{}{} - - tagValue, ok := series.tags[string(tag.Name)] - require.True(t, ok) - - require.Equal(t, tagValue, string(tag.Value)) - } - require.Equal(t, len(series.tags), len(matchingTags)) - } - require.NoError(t, docs.Err()) - require.NoError(t, docs.Close()) - - require.Equal(t, len(expected.series), len(matches)) - } - } - } -} - -func TestBootstrapIndex(t *testing.T) { +func TestBootstrapIndexPartiallyFulfilled(t *testing.T) { dir := createTempDir(t) defer os.RemoveAll(dir) @@ -313,15 +178,20 @@ func TestBootstrapIndex(t *testing.T) { } times := newTestBootstrapIndexTimes(timesOpts) - writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, times.start) + testData := testGoodTaggedSeriesDataBlocks(4) + writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, 
times.start, testData) + // Index block size is 4h and data block size is 2h. + // So write the first two data blocks to disk as first index block. + // We later ensure that second index block is completely unfulfilled. + shards := map[uint32]struct{}{testShard: struct{}{}} + writeTSDBPersistedIndexBlock(t, dir, testNsMetadata(t), times.start, shards, + append(testData[0], testData[1]...)) opts := newTestOptionsWithPersistManager(t, dir) scope := tally.NewTestScope("", nil) opts = opts.SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(scope)) - // Should always be run with persist enabled. - runOpts := testDefaultRunOpts. - SetPersistConfig(bootstrap.PersistConfig{Enabled: true}) + runOpts := testDefaultRunOpts fsSrc, err := newFileSystemSource(opts) require.NoError(t, err) @@ -335,123 +205,38 @@ func TestBootstrapIndex(t *testing.T) { defer tester.Finish() tester.TestReadWith(src) - indexResults := tester.ResultForNamespace(nsMD.ID()).IndexResult.IndexResults() - - // Check that single persisted segment got written out - infoFiles := fs.ReadIndexInfoFiles(src.fsopts.FilePathPrefix(), testNs1ID, - src.fsopts.InfoReaderBufferSize()) - require.Equal(t, 1, len(infoFiles)) - - for _, infoFile := range infoFiles { - require.NoError(t, infoFile.Err.Error()) - require.Equal(t, times.start.UnixNano(), infoFile.Info.BlockStart) - require.Equal(t, testIndexBlockSize, time.Duration(infoFile.Info.BlockSize)) - require.Equal(t, persist.FileSetFlushType, persist.FileSetType(infoFile.Info.FileType)) - require.Equal(t, 1, len(infoFile.Info.Segments)) - require.Equal(t, 1, len(infoFile.Info.Shards)) - require.Equal(t, testShard, infoFile.Info.Shards[0]) - } + results := tester.ResultForNamespace(nsMD.ID()).IndexResult - // Check that the segment is not a mutable segment for this block - blockByVolumeType, ok := indexResults[xtime.ToUnixNano(times.start)] + // Check that the index segment for the persisted index block is in the results + // and came from disk.
+ blockByVolumeType, ok := results.IndexResults()[xtime.ToUnixNano(times.start)] require.True(t, ok) block, ok := blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) require.True(t, ok) require.Equal(t, 1, len(block.Segments())) - segment := block.Segments()[0] - require.True(t, ok) - require.True(t, segment.IsPersisted()) - - // Check that the second segment is mutable and was not written out - blockByVolumeType, ok = indexResults[xtime.ToUnixNano(times.start.Add(testIndexBlockSize))] - require.True(t, ok) - block, ok = blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment = block.Segments()[0] - require.True(t, ok) - require.False(t, segment.IsPersisted()) - - // Validate results - validateGoodTaggedSeries(t, times.start, indexResults, timesOpts) - - // Validate that wrote the block out (and no index blocks - // were read as existing index blocks on disk) - counters := scope.Snapshot().Counters() - require.Equal(t, int64(0), counters["fs-bootstrapper.persist-index-blocks-read+"].Value()) - require.Equal(t, int64(1), counters["fs-bootstrapper.persist-index-blocks-write+"].Value()) -} - -func TestBootstrapIndexIgnoresPersistConfigIfSnapshotType(t *testing.T) { - dir := createTempDir(t) - defer os.RemoveAll(dir) - - timesOpts := testTimesOptions{ - numBlocks: 2, + for _, seg := range block.Segments() { + require.True(t, seg.IsPersisted()) } - times := newTestBootstrapIndexTimes(timesOpts) - - writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, times.start) - - opts := newTestOptionsWithPersistManager(t, dir) - scope := tally.NewTestScope("", nil) - opts = opts.SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(scope)) - - runOpts := testDefaultRunOpts. 
- SetPersistConfig(bootstrap.PersistConfig{ - Enabled: true, - FileSetType: persist.FileSetSnapshotType, - }) - - fsSrc, err := newFileSystemSource(opts) - require.NoError(t, err) - - src, ok := fsSrc.(*fileSystemSource) - require.True(t, ok) - - nsMD := testNsMetadata(t) - tester := bootstrap.BuildNamespacesTesterWithFilesystemOptions(t, runOpts, - times.shardTimeRanges, opts.FilesystemOptions(), nsMD) - defer tester.Finish() - - tester.TestReadWith(src) - indexResults := tester.ResultForNamespace(nsMD.ID()).IndexResult.IndexResults() - - // Check that not segments were written out - infoFiles := fs.ReadIndexInfoFiles(src.fsopts.FilePathPrefix(), testNs1ID, - src.fsopts.InfoReaderBufferSize()) - require.Equal(t, 0, len(infoFiles)) - - // Check that both segments are mutable - blockByVolumeType, ok := indexResults[xtime.ToUnixNano(times.start)] - require.True(t, ok) - block, ok := blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment := block.Segments()[0] - require.True(t, ok) - require.False(t, segment.IsPersisted()) - - blockByVolumeType, ok = indexResults[xtime.ToUnixNano(times.start.Add(testIndexBlockSize))] - require.True(t, ok) - block, ok = blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment = block.Segments()[0] - require.True(t, ok) - require.False(t, segment.IsPersisted()) + // Ensure that the second index block does not exist in the results. + _, ok = results.IndexResults()[xtime.ToUnixNano(times.start.Add(testIndexBlockSize))] + require.False(t, ok) - // Validate results - validateGoodTaggedSeries(t, times.start, indexResults, timesOpts) + // Validate that the block was read and that the second index block is unfulfilled.
+ expectedUnfulfilled := result.NewShardTimeRanges().Set( + testShard, + xtime.NewRanges(xtime.Range{ + Start: times.start.Add(testIndexBlockSize), + End: times.start.Add(2 * testIndexBlockSize), + }), + ) + require.True(t, expectedUnfulfilled.Equal(results.Unfulfilled())) - // Validate that no index blocks were read from disk and that no files were written out counters := scope.Snapshot().Counters() - require.Equal(t, int64(0), counters["fs-bootstrapper.persist-index-blocks-read+"].Value()) - require.Equal(t, int64(0), counters["fs-bootstrapper.persist-index-blocks-write+"].Value()) - tester.EnsureNoWrites() + require.Equal(t, int64(1), counters["fs-bootstrapper.persist-index-blocks-read+"].Value()) } -func TestBootstrapIndexWithPersistPrefersPersistedIndexBlocks(t *testing.T) { +func TestBootstrapIndexUnfulfilled(t *testing.T) { dir := createTempDir(t) defer os.RemoveAll(dir) @@ -460,21 +245,15 @@ func TestBootstrapIndexWithPersistPrefersPersistedIndexBlocks(t *testing.T) { } times := newTestBootstrapIndexTimes(timesOpts) - // Write data files - writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, times.start) - - // Now write index block segment from first two data blocks - testData := testGoodTaggedSeriesDataBlocks() - shards := map[uint32]struct{}{testShard: struct{}{}} - writeTSDBPersistedIndexBlock(t, dir, testNsMetadata(t), times.start, shards, - append(testData[0], testData[1]...)) + testData := testGoodTaggedSeriesDataBlocks(4) + writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, times.start, testData) + // No index data persisted to disk, we ensure later that the entire range is unfulfilled. opts := newTestOptionsWithPersistManager(t, dir) scope := tally.NewTestScope("", nil) opts = opts.SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(scope)) - runOpts := testDefaultRunOpts. 
- SetPersistConfig(bootstrap.PersistConfig{Enabled: true}) + runOpts := testDefaultRunOpts fsSrc, err := newFileSystemSource(opts) require.NoError(t, err) @@ -488,169 +267,20 @@ func TestBootstrapIndexWithPersistPrefersPersistedIndexBlocks(t *testing.T) { defer tester.Finish() tester.TestReadWith(src) - indexResults := tester.ResultForNamespace(nsMD.ID()).IndexResult.IndexResults() - - // Check that the segment is not a mutable segment for this block - // and came from disk - blockByVolumeType, ok := indexResults[xtime.ToUnixNano(times.start)] - require.True(t, ok) - block, ok := blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment := block.Segments()[0] - require.True(t, ok) - require.True(t, segment.IsPersisted()) + results := tester.ResultForNamespace(nsMD.ID()).IndexResult - // Check that the second segment is mutable - blockByVolumeType, ok = indexResults[xtime.ToUnixNano(times.start.Add(testIndexBlockSize))] - require.True(t, ok) - block, ok = blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment = block.Segments()[0] - require.True(t, ok) - require.False(t, segment.IsPersisted()) - - // Validate results - validateGoodTaggedSeries(t, times.start, indexResults, timesOpts) - - // Validate that read the block and no blocks were written - // (ensure persist config didn't write it back out again) - counters := scope.Snapshot().Counters() - require.Equal(t, int64(1), counters["fs-bootstrapper.persist-index-blocks-read+"].Value()) - require.Equal(t, int64(0), counters["fs-bootstrapper.persist-index-blocks-write+"].Value()) - tester.EnsureNoWrites() -} - -// TODO: Make this test actually exercise the case at the retention edge, -// right now it only builds a partial segment for the second of three index -// blocks it is trying to build. 
-func TestBootstrapIndexWithPersistForIndexBlockAtRetentionEdge(t *testing.T) { - dir := createTempDir(t) - defer os.RemoveAll(dir) - - timesOpts := testTimesOptions{ - numBlocks: 3, - } - times := newTestBootstrapIndexTimes(timesOpts) - firstIndexBlockStart := times.start.Truncate(testIndexBlockSize) - - writeTSDBGoodTaggedSeriesDataFiles(t, dir, testNs1ID, times.start) - - opts := newTestOptionsWithPersistManager(t, dir) + // Ensure that the first index block does not exist in the results. + _, ok = results.IndexResults()[xtime.ToUnixNano(times.start)] + require.False(t, ok) - scope := tally.NewTestScope("", nil) - opts = opts. - SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(scope)) - - at := time.Now() - resultOpts := opts.ResultOptions() - clockOpts := resultOpts.ClockOptions(). - SetNowFn(func() time.Time { - return at - }) - opts = opts.SetResultOptions(resultOpts.SetClockOptions(clockOpts)) - - runOpts := testDefaultRunOpts. - SetPersistConfig(bootstrap.PersistConfig{Enabled: true}) - - fsSrc, err := newFileSystemSource(opts) - require.NoError(t, err) - - src, ok := fsSrc.(*fileSystemSource) - require.True(t, ok) - - retentionPeriod := testBlockSize - for { - // Make sure that retention is set to end half way through the first block - flushStart := retention.FlushTimeStartForRetentionPeriod(retentionPeriod, testBlockSize, at) - if flushStart.Before(firstIndexBlockStart.Add(testIndexBlockSize)) { - break - } - retentionPeriod += testBlockSize - } - - ropts := testRetentionOptions. - SetBlockSize(testBlockSize). - SetRetentionPeriod(retentionPeriod) - ns, err := namespace.NewMetadata(testNs1ID, testNamespaceOptions. - SetRetentionOptions(ropts). - SetIndexOptions(testNamespaceIndexOptions. - SetEnabled(true). - SetBlockSize(testIndexBlockSize))) - require.NoError(t, err) - - // NB(bodu): Simulate requesting bootstrapping of two whole index blocks instead of 3 data blocks (1.5 index blocks). 
- times.shardTimeRanges = result.NewShardTimeRanges().Set( - testShard, - xtime.NewRanges(xtime.Range{ - Start: firstIndexBlockStart, - End: times.end, - }), - ) - tester := bootstrap.BuildNamespacesTesterWithFilesystemOptions(t, runOpts, - times.shardTimeRanges, opts.FilesystemOptions(), ns) - defer tester.Finish() - - tester.TestReadWith(src) - indexResults := tester.ResultForNamespace(ns.ID()).IndexResult.IndexResults() - - // Check that single persisted segment got written out - infoFiles := fs.ReadIndexInfoFiles(src.fsopts.FilePathPrefix(), testNs1ID, - src.fsopts.InfoReaderBufferSize()) - require.Equal(t, 2, len(infoFiles)) - - for _, infoFile := range infoFiles { - require.NoError(t, infoFile.Err.Error()) - - if infoFile.Info.BlockStart == firstIndexBlockStart.UnixNano() { - expectedStart := times.end.Add(-2 * testIndexBlockSize).UnixNano() - require.Equal(t, expectedStart, infoFile.Info.BlockStart, - fmt.Sprintf("expected=%v, actual=%v", - time.Unix(0, expectedStart).String(), - time.Unix(0, infoFile.Info.BlockStart))) - } else { - expectedStart := times.end.Add(-1 * testIndexBlockSize).UnixNano() - require.Equal(t, expectedStart, infoFile.Info.BlockStart, - fmt.Sprintf("expected=%v, actual=%v", - time.Unix(0, expectedStart).String(), - time.Unix(0, infoFile.Info.BlockStart))) - } - - require.Equal(t, testIndexBlockSize, time.Duration(infoFile.Info.BlockSize)) - require.Equal(t, persist.FileSetFlushType, persist.FileSetType(infoFile.Info.FileType)) - require.Equal(t, 1, len(infoFile.Info.Segments)) - require.Equal(t, 1, len(infoFile.Info.Shards)) - require.Equal(t, testShard, infoFile.Info.Shards[0]) - } - - // Check that the segment is not a mutable segment - blockByVolumeType, ok := indexResults[xtime.ToUnixNano(firstIndexBlockStart)] - require.True(t, ok) - block, ok := blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment := block.Segments()[0] - require.True(t, ok) - 
require.True(t, segment.IsPersisted()) - - // Check that the second is not a mutable segment - blockByVolumeType, ok = indexResults[xtime.ToUnixNano(firstIndexBlockStart.Add(testIndexBlockSize))] - require.True(t, ok) - block, ok = blockByVolumeType.GetBlock(idxpersist.DefaultIndexVolumeType) - require.True(t, ok) - require.Equal(t, 1, len(block.Segments())) - segment = block.Segments()[0] - require.True(t, ok) - require.True(t, segment.IsPersisted()) + // Ensure that the second index block does not exist in the results. + _, ok = results.IndexResults()[xtime.ToUnixNano(times.start.Add(testIndexBlockSize))] + require.False(t, ok) - // Validate results - validateGoodTaggedSeries(t, firstIndexBlockStart, indexResults, timesOpts) + // Validate that no index blocks were read and that the entire requested range is unfulfilled. + expectedUnfulfilled := times.shardTimeRanges + require.True(t, expectedUnfulfilled.Equal(results.Unfulfilled())) - // Validate that wrote the block out (and no index blocks - // were read as existing index blocks on disk) counters := scope.Snapshot().Counters() require.Equal(t, int64(0), counters["fs-bootstrapper.persist-index-blocks-read+"].Value()) - require.Equal(t, int64(2), counters["fs-bootstrapper.persist-index-blocks-write+"].Value()) - tester.EnsureNoWrites() } diff --git a/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go b/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go index 0ab602644e..a3d2469cfb 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go @@ -455,7 +455,9 @@ func (s *peersSource) fetchBootstrapBlocksFromPeers( } for _, block := range entry.Blocks.AllBlocks() { - if err := ref.Series.LoadBlock(block, series.WarmWrite); err != nil { + // NB(bodu): Since we aren't fetching peer index blocks, we must index all fetched data blocks + // during the loading phase.
+ if err := ref.Series.LoadBlockAndIndex(block, series.WarmWrite); err != nil { unfulfill(currRange) s.log.Error("could not load series block", zap.Error(err)) } @@ -943,6 +945,8 @@ func (s *peersSource) processReaders( }) } } else { + // TODO(bodu): We can remove the in-memory index segment building code path once + // we've fully migrated to 1:1 sizing of TSDB and index blocks. s.log.Info("building in-memory index segment", buildIndexLogFields...) indexBlock, err = bootstrapper.BuildBootstrapIndexSegment( ns, diff --git a/src/dbnode/storage/bootstrap/bootstrapper/persist.go b/src/dbnode/storage/bootstrap/bootstrapper/persist.go index acb908c7f0..716c747aec 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/persist.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/persist.go @@ -160,7 +160,7 @@ func persistBootstrapIndexSegment( } }() - preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ + preparedPersist, err := flush.PrepareIndexFlush(persist.IndexPrepareOptions{ NamespaceMetadata: ns, BlockStart: blockStart, FileSetType: persist.FileSetFlushType, diff --git a/src/dbnode/storage/bootstrap/bootstrapper/ranges.go b/src/dbnode/storage/bootstrap/bootstrapper/ranges.go index 6c394b9e63..e2754022bb 100644 --- a/src/dbnode/storage/bootstrap/bootstrapper/ranges.go +++ b/src/dbnode/storage/bootstrap/bootstrapper/ranges.go @@ -78,3 +78,39 @@ func minTime(x, y time.Time) time.Time { } return y } + +// IntersectingShardTimeRanges gets intersecting shard time ranges across shards +// for requested shard time ranges and a given block start x block size. 
+func IntersectingShardTimeRanges( + shardTimeRanges result.ShardTimeRanges, + shards []uint32, + blockStart time.Time, + blockSize time.Duration, +) result.ShardTimeRanges { + blockRange := xtime.Range{ + Start: blockStart, + End: blockStart.Add(blockSize), + } + willFulfill := result.NewShardTimeRanges() + for _, shard := range shards { + tr, ok := shardTimeRanges.Get(shard) + if !ok { + // No ranges match for this shard. + continue + } + if _, ok := willFulfill.Get(shard); !ok { + willFulfill.Set(shard, xtime.NewRanges()) + } + + iter := tr.Iter() + for iter.Next() { + curr := iter.Value() + intersection, intersects := curr.Intersect(blockRange) + if !intersects { + continue + } + willFulfill.GetOrAdd(shard).AddRange(intersection) + } + } + return willFulfill +} diff --git a/src/dbnode/storage/bootstrap/bootstrapper/ranges_test.go b/src/dbnode/storage/bootstrap/bootstrapper/ranges_test.go new file mode 100644 index 0000000000..613f41254b --- /dev/null +++ b/src/dbnode/storage/bootstrap/bootstrapper/ranges_test.go @@ -0,0 +1,48 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package bootstrapper + +import ( + "testing" + "time" + + "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" + "github.com/stretchr/testify/require" +) + +func TestIntersectingShardTimeRanges(t *testing.T) { + shards := []uint32{0, 1, 2, 3} + blockSize := time.Hour + t0 := time.Now().Truncate(blockSize) + t1 := t0.Add(blockSize) + t2 := t1.Add(blockSize) + fullRange := result.NewShardTimeRangesFromRange(t0, t2, shards...) + + expectedIntersect := result.NewShardTimeRangesFromRange(t1, t2, shards...) + intersect := IntersectingShardTimeRanges(fullRange, shards, t1, blockSize) + require.True(t, intersect.Equal(expectedIntersect)) + + // Try with non-overlapping shards. + intersectShards := []uint32{0} + expectedIntersect = result.NewShardTimeRangesFromRange(t1, t2, intersectShards...) + intersect = IntersectingShardTimeRanges(fullRange, intersectShards, t1, blockSize) + require.True(t, intersect.Equal(expectedIntersect)) +} diff --git a/src/dbnode/storage/bootstrap/result/result_index.go b/src/dbnode/storage/bootstrap/result/result_index.go index 1a398139a1..b113b5dd72 100644 --- a/src/dbnode/storage/bootstrap/result/result_index.go +++ b/src/dbnode/storage/bootstrap/result/result_index.go @@ -33,6 +33,11 @@ import ( xtime "github.com/m3db/m3/src/x/time" ) +const ( + // Volume index of -1 means unset. + volumeIndexUnset = -1 +) + // NewDefaultDocumentsBuilderAllocator returns a default mutable segment // allocator. 
func NewDefaultDocumentsBuilderAllocator() DocumentsBuilderAllocator { @@ -258,8 +263,9 @@ func NewIndexBlock( fulfilled = NewShardTimeRanges() } return IndexBlock{ - segments: segments, - fulfilled: fulfilled, + segments: segments, + fulfilled: fulfilled, + volumeIndex: volumeIndexUnset, } } @@ -288,6 +294,17 @@ func (b IndexBlock) Merged(other IndexBlock) IndexBlock { return r } +// VolumeIndex returns the volume index (if set) of the persisted index fileset. +func (b IndexBlock) VolumeIndex() int { + return b.volumeIndex +} + +// SetVolumeIndex sets the volume index of the persisted index fileset. +// NB: this requires a pointer receiver, with a value receiver the assignment only mutates a copy and is lost. +func (b *IndexBlock) SetVolumeIndex(i int) { + b.volumeIndex = i +} + // NewIndexBlockByVolumeType returns a new bootstrap index blocks by volume type result. func NewIndexBlockByVolumeType(blockStart time.Time) IndexBlockByVolumeType { return IndexBlockByVolumeType{ @@ -312,6 +329,11 @@ func (b IndexBlockByVolumeType) SetBlock(volumeType persist.IndexVolumeType, blo b.data[volumeType] = block } +// DeleteBlock deletes the index block for a volume type if it exists. +func (b IndexBlockByVolumeType) DeleteBlock(volumeType persist.IndexVolumeType) { + delete(b.data, volumeType) +} + // Iter returns the underlying iterable map data. func (b IndexBlockByVolumeType) Iter() map[persist.IndexVolumeType]IndexBlock { return b.data diff --git a/src/dbnode/storage/bootstrap/result/types.go b/src/dbnode/storage/bootstrap/result/types.go index 423553ee22..000992de76 100644 --- a/src/dbnode/storage/bootstrap/result/types.go +++ b/src/dbnode/storage/bootstrap/result/types.go @@ -76,8 +76,9 @@ type IndexBlockByVolumeType struct { // IndexBlock is an index block for a index volume type. type IndexBlock struct { - segments []Segment - fulfilled ShardTimeRanges + segments []Segment + fulfilled ShardTimeRanges + volumeIndex int } // Segment wraps an index segment so we can easily determine whether or not the segment is persisted to disk.
diff --git a/src/dbnode/storage/bootstrap/types.go b/src/dbnode/storage/bootstrap/types.go index 0a64face66..175922003c 100644 --- a/src/dbnode/storage/bootstrap/types.go +++ b/src/dbnode/storage/bootstrap/types.go @@ -495,4 +495,11 @@ type SeriesRef interface { block block.DatabaseBlock, writeType series.WriteType, ) error + + // LoadBlockAndIndex loads a single block into the series and attempts to index the series + // if not already attempted. + LoadBlockAndIndex( + block block.DatabaseBlock, + writeType series.WriteType, + ) error } diff --git a/src/dbnode/storage/bootstrap/util.go b/src/dbnode/storage/bootstrap/util.go index 215d33136d..439974cf75 100644 --- a/src/dbnode/storage/bootstrap/util.go +++ b/src/dbnode/storage/bootstrap/util.go @@ -235,26 +235,30 @@ func (a *TestDataAccumulator) checkoutSeriesWithLock( } var streamErr error - mockSeries := series.NewMockDatabaseSeries(a.ctrl) + mockSeries := NewMockSeriesRef(a.ctrl) - mockSeries.EXPECT(). - LoadBlock(gomock.Any(), gomock.Any()). - DoAndReturn(func(bl block.DatabaseBlock, _ series.WriteType) error { - reader, err := bl.Stream(context.NewContext()) - if err != nil { - streamErr = err - return err - } + loadBlockReturnFn := func(bl block.DatabaseBlock, _ series.WriteType) error { + reader, err := bl.Stream(context.NewContext()) + if err != nil { + streamErr = err + return err + } - a.loadedBlockMap[stringID] = append(a.loadedBlockMap[stringID], - ReaderAtTime{ - Start: bl.StartTime(), - Reader: reader, - Tags: decodedTags, - }) + a.loadedBlockMap[stringID] = append(a.loadedBlockMap[stringID], + ReaderAtTime{ + Start: bl.StartTime(), + Reader: reader, + Tags: decodedTags, + }) - return nil - }).AnyTimes() + return nil + } + mockSeries.EXPECT(). + LoadBlock(gomock.Any(), gomock.Any()). + DoAndReturn(loadBlockReturnFn).AnyTimes() + mockSeries.EXPECT(). + LoadBlockAndIndex(gomock.Any(), gomock.Any()). 
+ DoAndReturn(loadBlockReturnFn).AnyTimes() mockSeries.EXPECT().Write( gomock.Any(), gomock.Any(), gomock.Any(), diff --git a/src/dbnode/storage/cleanup.go b/src/dbnode/storage/cleanup.go index d91db51329..57a6b81859 100644 --- a/src/dbnode/storage/cleanup.go +++ b/src/dbnode/storage/cleanup.go @@ -33,6 +33,7 @@ import ( "github.com/m3db/m3/src/dbnode/retention" xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/ident" + xtime "github.com/m3db/m3/src/x/time" "github.com/pborman/uuid" "github.com/uber-go/tally" @@ -42,8 +43,6 @@ import ( type commitLogFilesFn func(commitlog.Options) (persist.CommitLogFiles, []commitlog.ErrorWithPath, error) type snapshotMetadataFilesFn func(fs.Options) ([]fs.SnapshotMetadata, []fs.SnapshotMetadataErrorWithPaths, error) -type snapshotFilesFn func(filePathPrefix string, namespace ident.ID, shard uint32) (fs.FileSetFilesSlice, error) - type deleteFilesFn func(files []string) error type deleteInactiveDirectoriesFn func(parentDirPath string, activeDirNames []string) error @@ -66,7 +65,8 @@ type cleanupManager struct { commitLogsDir string commitLogFilesFn commitLogFilesFn snapshotMetadataFilesFn snapshotMetadataFilesFn - snapshotFilesFn snapshotFilesFn + snapshotFilesFn fs.SnapshotFilesFn + indexSnapshotFilesFn fs.IndexSnapshotFilesFn deleteFilesFn deleteFilesFn deleteInactiveDirectoriesFn deleteInactiveDirectoriesFn @@ -120,6 +120,7 @@ func newCleanupManager( commitLogFilesFn: commitlog.Files, snapshotMetadataFilesFn: fs.SortedSnapshotMetadataFiles, snapshotFilesFn: fs.SnapshotFiles, + indexSnapshotFilesFn: fs.IndexSnapshotFiles, deleteFilesFn: fs.DeleteFiles, deleteInactiveDirectoriesFn: fs.DeleteInactiveDirectories, metrics: newCleanupManagerMetrics(scope), @@ -170,9 +171,17 @@ func (m *cleanupManager) WarmFlushCleanup(t time.Time, isBootstrapped bool) erro "encountered errors when deleting inactive namespace files for %v: %v", t, err)) } - if err := m.cleanupSnapshotsAndCommitlogs(namespaces); err != nil { + // 
NB(bodu): Cleanup of index && data snapshots MUST happen during warm flush cleanup + // because cleanup cannot happen concurrently w/ snapshotting as we could be removing latest snapshots + // before the latest snapshot metadata gets written to disk. + if err := m.cleanupIndexSnapshots(namespaces); err != nil { + multiErr = multiErr.Add(fmt.Errorf( + "encountered errors when cleaning up index snapshot files: %v", err)) + } + + if err := m.cleanupDataSnapshotsAndCommitlogs(namespaces); err != nil { multiErr = multiErr.Add(fmt.Errorf( - "encountered errors when cleaning up snapshot and commitlog files: %v", err)) + "encountered errors when cleaning up data snapshot and commitlog files: %v", err)) } return multiErr.FinalError() @@ -272,6 +281,7 @@ func (m *cleanupManager) deleteInactiveDataFileSetFiles(filesetFilesDirPathFn fu return multiErr.FinalError() } +// cleanupDataFiles cleans up out of retention and compacted data file sets. func (m *cleanupManager) cleanupDataFiles(t time.Time, namespaces []databaseNamespace) error { multiErr := xerrors.NewMultiError() for _, n := range namespaces { @@ -340,7 +350,7 @@ func (m *cleanupManager) cleanupCompactedNamespaceDataFiles(shards []databaseSha return multiErr.FinalError() } -// The goal of the cleanupSnapshotsAndCommitlogs function is to delete all snapshots files, snapshot metadata +// The goal of the cleanupDataSnapshotsAndCommitlogs function is to delete all data snapshots files, snapshot metadata // files, and commitlog files except for those that are currently required for recovery from a node failure. // According to the snapshotting / commitlog rotation logic, the files that are required for a complete // recovery are: @@ -370,40 +380,13 @@ func (m *cleanupManager) cleanupCompactedNamespaceDataFiles(shards []databaseSha // 9. Delete all corrupt commitlog files (ignoring any commitlog files being actively written to.) // // This process is also modeled formally in TLA+ in the file `SnapshotsSpec.tla`. 
-func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseNamespace) (finalErr error) { - logger := m.opts.InstrumentOptions().Logger().With( +func (m *cleanupManager) cleanupDataSnapshotsAndCommitlogs(namespaces []databaseNamespace) (finalErr error) { + m.logger.With( zap.String("comment", "partial/corrupt files are expected as result of a restart (this is ok)"), ) - fsOpts := m.opts.CommitLogOptions().FilesystemOptions() - snapshotMetadatas, snapshotMetadataErrorsWithPaths, err := m.snapshotMetadataFilesFn(fsOpts) - if err != nil { - return err - } - - if len(snapshotMetadatas) == 0 { - // No cleanup can be performed until we have at least one complete snapshot. - return nil - } - - // They should technically already be sorted, but better to be safe. - sort.Slice(snapshotMetadatas, func(i, j int) bool { - return snapshotMetadatas[i].ID.Index < snapshotMetadatas[j].ID.Index - }) - sortedSnapshotMetadatas := snapshotMetadatas - - // Sanity check. - lastMetadataIndex := int64(-1) - for _, snapshotMetadata := range sortedSnapshotMetadatas { - currIndex := snapshotMetadata.ID.Index - if currIndex == lastMetadataIndex { - // Should never happen. - return fmt.Errorf( - "found two snapshot metadata files with duplicate index: %d", currIndex) - } - lastMetadataIndex = currIndex - } + sortedSnapshotMetadatas, snapshotMetadataErrorsWithPaths, err := m.sortedSnapshotMetadataFiles() if len(sortedSnapshotMetadatas) == 0 { // No cleanup can be performed until we have at least one complete snapshot. 
@@ -411,6 +394,7 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName } var ( + filePathPrefix = m.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() multiErr = xerrors.NewMultiError() filesToDelete = []string{} mostRecentSnapshot = sortedSnapshotMetadatas[len(sortedSnapshotMetadatas)-1] @@ -425,7 +409,7 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName for _, ns := range namespaces { for _, s := range ns.OwnedShards() { - shardSnapshots, err := m.snapshotFilesFn(fsOpts.FilePathPrefix(), ns.ID(), s.ID()) + shardSnapshots, err := m.snapshotFilesFn(filePathPrefix, ns.ID(), s.ID()) if err != nil { multiErr = multiErr.Add(fmt.Errorf("err reading snapshot files for ns: %s and shard: %d, err: %v", ns.ID(), s.ID(), err)) continue @@ -439,7 +423,7 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName // have no impact on correctness as the snapshot files from previous (successful) snapshot will still be // retained. m.metrics.corruptSnapshotFile.Inc(1) - logger.With( + m.logger.With( zap.Error(err), zap.Strings("files", snapshot.AbsoluteFilePaths), ).Warn("corrupt snapshot file during cleanup, marking files for deletion") @@ -466,7 +450,7 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName // Delete corrupt snapshot metadata files. for _, errorWithPath := range snapshotMetadataErrorsWithPaths { m.metrics.corruptSnapshotMetadataFile.Inc(1) - logger.With( + m.logger.With( zap.Error(errorWithPath.Error), zap.String("metadataFilePath", errorWithPath.MetadataFilePath), zap.String("checkpointFilePath", errorWithPath.CheckpointFilePath), @@ -517,7 +501,7 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName // If we were unable to read the commit log files info header, then we're forced to assume // that the file is corrupt and remove it. This can happen in situations where M3DB experiences // sudden shutdown. 
- logger.With( + m.logger.With( zap.Error(errorWithPath), zap.String("path", errorWithPath.Path()), ).Warn("corrupt commitlog file during cleanup, marking file for deletion") @@ -526,3 +510,126 @@ func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseName return finalErr } + +// cleanupIndexSnapshots is decoupled from the cleanup data snapshots and commit logs logic. Index snapshotting and +// data snapshotting happen at the same time and share the same snapshot metadata. However, we don't use snapshot metadata +// to determine whether or not to cleanup index snapshot files from disk. We apply the following logic: +// +// 1. Get a snapshot of all index block states. +// 2. Get all index snapshot files. +// 3. Remove index snapshots (on a per block start basis) up to either the loaded snapshot version +// for that block start or the flushed snapshot version (latest). +// +// We do so to ensure that we are not deleting index snapshots from disk while they are still loaded. Cleanup of commit logs +// still happens in the cleanup data snapshots path. Since index and data snapshots share the same rotated commitlog identifier, +// this work only needs to happen there once. +func (m *cleanupManager) cleanupIndexSnapshots(namespaces []databaseNamespace) error { + sortedSnapshotMetadatas, _, err := m.sortedSnapshotMetadataFiles() + if err != nil { + return err + } + + if len(sortedSnapshotMetadatas) == 0 { + // No cleanup can be performed until we have at least one complete snapshot.
+ return nil + } + + var ( + filePathPrefix = m.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix() + multiErr = xerrors.NewMultiError() + filesToDelete = []string{} + mostRecentSnapshot = sortedSnapshotMetadatas[len(sortedSnapshotMetadatas)-1] + ) + + for _, ns := range namespaces { + if !ns.Options().IndexOptions().Enabled() { + continue + } + + i, err := ns.Index() + if err != nil { + multiErr = multiErr.Add(err) + continue + } + blockStates, ok := i.BlockStatesSnapshot().UnwrapValue() + if !ok { + // Should not happen as cleanup manager does not run concurrently w/ bootstrapper but we do not look + // at un-bootstrapped indices. + continue + } + // Get index snapshot files and cross-ref block states. + snapshots, err := m.indexSnapshotFilesFn(filePathPrefix, ns.ID()) + if err != nil { + // Cannot determine what is safe to delete without the file listing, record the error and skip this namespace. + multiErr = multiErr.Add(fmt.Errorf("err reading index snapshot files for ns: %s, err: %v", ns.ID(), err)) + continue + } + for _, snapshot := range snapshots { + _, snapshotID, err := snapshot.SnapshotTimeAndID() + if err != nil { + // If we can't parse the snapshotID, assume the snapshot is corrupt and delete it. This could be caused + // by a variety of situations, like a node crashing while writing out a set of snapshot files and should + // have no impact on correctness as the snapshot files from previous (successful) snapshot will still be + // retained. + m.metrics.corruptSnapshotFile.Inc(1) + m.logger.With( + zap.Error(err), + zap.Strings("files", snapshot.AbsoluteFilePaths), + ).Warn("corrupt index snapshot file during cleanup, marking files for deletion") + filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths...) + continue + } + + // We either remove up to the loaded snapshot version or everything but the most recent snapshot. + if blockState, ok := blockStates.Snapshot[xtime.ToUnixNano(snapshot.ID.BlockStart)]; ok && blockState.SnapshotVersionLoaded != snapshotVersionUnset { + if snapshot.ID.VolumeIndex < blockState.SnapshotVersionLoaded { + m.metrics.deletedSnapshotFile.Inc(1) + filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths...)
+ } + continue + } + if !uuid.Equal(snapshotID, mostRecentSnapshot.ID.UUID) { + // If the UUID of the snapshot files doesn't match the most recent snapshot + // then it's safe to delete because it means we have a more recently complete set. + m.metrics.deletedSnapshotFile.Inc(1) + filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths...) + } + } + } + + multiErr = multiErr.Add(m.deleteFilesFn(filesToDelete)) + + return multiErr.FinalError() +} + +func (m *cleanupManager) sortedSnapshotMetadataFiles() ( + []fs.SnapshotMetadata, + []fs.SnapshotMetadataErrorWithPaths, + error, +) { + fsOpts := m.opts.CommitLogOptions().FilesystemOptions() + snapshotMetadatas, snapshotMetadataErrorsWithPaths, err := m.snapshotMetadataFilesFn(fsOpts) + if err != nil { + return nil, nil, err + } + + // They should technically already be sorted, but better to be safe. + sort.Slice(snapshotMetadatas, func(i, j int) bool { + return snapshotMetadatas[i].ID.Index < snapshotMetadatas[j].ID.Index + }) + sortedSnapshotMetadatas := snapshotMetadatas + + // Sanity check. + lastMetadataIndex := int64(-1) + for _, snapshotMetadata := range sortedSnapshotMetadatas { + currIndex := snapshotMetadata.ID.Index + if currIndex == lastMetadataIndex { + // Should never happen.
+ return nil, nil, fmt.Errorf( + "found two snapshot metadata files with duplicate index: %d", currIndex) + } + lastMetadataIndex = currIndex + } + + return sortedSnapshotMetadatas, snapshotMetadataErrorsWithPaths, nil +} diff --git a/src/dbnode/storage/cleanup_test.go b/src/dbnode/storage/cleanup_test.go index acc15dc251..a02ce24c3e 100644 --- a/src/dbnode/storage/cleanup_test.go +++ b/src/dbnode/storage/cleanup_test.go @@ -32,9 +32,11 @@ import ( "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" "github.com/m3db/m3/src/dbnode/retention" + "github.com/m3db/m3/src/dbnode/storage/index" xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" + xtime "github.com/m3db/m3/src/x/time" "github.com/golang/mock/gomock" "github.com/pborman/uuid" @@ -51,13 +53,17 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - testBlockStart := time.Now().Truncate(2 * time.Hour) + testBlockSize := 2 * time.Hour + testBlockStart := time.Now().Truncate(testBlockSize) testSnapshotUUID0 := uuid.Parse("a6367b49-9c83-4706-bd5c-400a4a9ec77c") require.NotNil(t, testSnapshotUUID0) testSnapshotUUID1 := uuid.Parse("bed2156f-182a-47ea-83ff-0a55d34c8a82") require.NotNil(t, testSnapshotUUID1) + testSnapshotUUID2 := uuid.Parse("d5582205-abea-4ec2-9c73-4a22535c1fff") + require.NotNil(t, testSnapshotUUID2) + testCommitlogFileIdentifier := persist.CommitLogFile{ FilePath: "commitlog-filepath-1", Index: 1, @@ -87,7 +93,11 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { title string snapshotMetadata snapshotMetadataFilesFn commitlogs commitLogFilesFn - snapshots snapshotFilesFn + snapshots fs.SnapshotFilesFn + indexSnapshots fs.IndexSnapshotFilesFn + indexBootstrapped bool + indexEnabled bool + indexBlockStates index.BootstrappedBlockStateSnapshot expectedDeletedFiles []string expectErr bool }{ @@ -271,6 
+281,177 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { expectedDeletedFiles: []string{"corrupt-commitlog-file-0", "corrupt-commitlog-file-1"}, expectErr: true, }, + { + title: "Deletes index snapshot files for block starts up to loaded version", + snapshotMetadata: func(fs.Options) ([]fs.SnapshotMetadata, []fs.SnapshotMetadataErrorWithPaths, error) { + return []fs.SnapshotMetadata{testSnapshotMetadata0}, nil, nil + }, + // Not testing data snapshot and commit log cleanup. + snapshots: func(filePathPrefix string, namespace ident.ID, shard uint32) (fs.FileSetFilesSlice, error) { + return nil, nil + }, + commitlogs: func(commitlog.Options) (persist.CommitLogFiles, []commitlog.ErrorWithPath, error) { + return nil, nil, nil + }, + indexSnapshots: func(filePathPrefix string, namespace ident.ID) (fs.FileSetFilesSlice, error) { + return fs.FileSetFilesSlice{ + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 0, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-0", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID0, + }, + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 1, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-1", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID1, + }, + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 2, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-2", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID2, + }, + }, nil + }, + indexBlockStates: index.BootstrappedBlockStateSnapshot{ + Snapshot: map[xtime.UnixNano]index.BlockState{ + xtime.ToUnixNano(testBlockStart): { + SnapshotVersionLoaded: 1, + }, + }, + }, + indexBootstrapped: true, + 
indexEnabled: true, + expectedDeletedFiles: []string{ + "/index_snapshots/ns0/snapshot-filepath-0", + "/index_snapshots/ns1/snapshot-filepath-0", + "/index_snapshots/ns2/snapshot-filepath-0", + }, + }, + { + title: "Deletes index snapshot files for block starts up to most recent snapshot UUID", + snapshotMetadata: func(fs.Options) ([]fs.SnapshotMetadata, []fs.SnapshotMetadataErrorWithPaths, error) { + return []fs.SnapshotMetadata{testSnapshotMetadata1}, nil, nil + }, + // Not testing data snapshot and commit log cleanup. + snapshots: func(filePathPrefix string, namespace ident.ID, shard uint32) (fs.FileSetFilesSlice, error) { + return nil, nil + }, + commitlogs: func(commitlog.Options) (persist.CommitLogFiles, []commitlog.ErrorWithPath, error) { + return nil, nil, nil + }, + indexSnapshots: func(filePathPrefix string, namespace ident.ID) (fs.FileSetFilesSlice, error) { + return fs.FileSetFilesSlice{ + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 0, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-0", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID0, + }, + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 1, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-1", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID1, + }, + }, nil + }, + indexBlockStates: index.BootstrappedBlockStateSnapshot{ + Snapshot: map[xtime.UnixNano]index.BlockState{ + xtime.ToUnixNano(testBlockStart): { + SnapshotVersionLoaded: -1, + }, + }, + }, + indexBootstrapped: true, + indexEnabled: true, + expectedDeletedFiles: []string{ + "/index_snapshots/ns0/snapshot-filepath-0", + "/index_snapshots/ns1/snapshot-filepath-0", + "/index_snapshots/ns2/snapshot-filepath-0", + }, + }, + { + title: "Does not delete index snapshot files since loaded 
version takes priority over most recent snapshot UUID", + snapshotMetadata: func(fs.Options) ([]fs.SnapshotMetadata, []fs.SnapshotMetadataErrorWithPaths, error) { + return []fs.SnapshotMetadata{testSnapshotMetadata1}, nil, nil + }, + // Not testing data snapshot and commit log cleanup. + snapshots: func(filePathPrefix string, namespace ident.ID, shard uint32) (fs.FileSetFilesSlice, error) { + return nil, nil + }, + commitlogs: func(commitlog.Options) (persist.CommitLogFiles, []commitlog.ErrorWithPath, error) { + return nil, nil, nil + }, + indexSnapshots: func(filePathPrefix string, namespace ident.ID) (fs.FileSetFilesSlice, error) { + return fs.FileSetFilesSlice{ + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 0, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-0", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID0, + }, + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart, + VolumeIndex: 1, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-1", namespace)}, + CachedSnapshotTime: testBlockStart, + CachedSnapshotID: testSnapshotUUID1, + }, + { + ID: fs.FileSetFileIdentifier{ + Namespace: namespace, + BlockStart: testBlockStart.Add(testBlockSize), + VolumeIndex: 0, + }, + AbsoluteFilePaths: []string{fmt.Sprintf("/index_snapshots/%s/snapshot-filepath-0", namespace)}, + CachedSnapshotTime: testBlockStart.Add(testBlockSize), + CachedSnapshotID: testSnapshotUUID0, + }, + }, nil + }, + indexBlockStates: index.BootstrappedBlockStateSnapshot{ + Snapshot: map[xtime.UnixNano]index.BlockState{ + xtime.ToUnixNano(testBlockStart): { + SnapshotVersionLoaded: 0, + }, + xtime.ToUnixNano(testBlockStart.Add(testBlockSize)): { + SnapshotVersionLoaded: 0, + }, + }, + }, + indexBootstrapped: true, + indexEnabled: true, + }, } for _, tc := range testCases { @@ -279,7 +460,11 @@ func 
TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { rOpts := retention.NewOptions(). SetRetentionPeriod(21600 * time.Second). SetBlockSize(7200 * time.Second) - nsOpts := namespace.NewOptions().SetRetentionOptions(rOpts) + nsOpts := namespace.NewOptions(). + SetRetentionOptions(rOpts). + SetIndexOptions(namespace.NewIndexOptions(). + SetEnabled(tc.indexEnabled). + SetBlockSize(7200 * time.Second)) namespaces := make([]databaseNamespace, 0, 3) shards := make([]databaseShard, 0, 3) @@ -298,6 +483,17 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { ns.EXPECT().Options().Return(nsOpts).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(false, nil).AnyTimes() ns.EXPECT().OwnedShards().Return(shards).AnyTimes() + + if tc.indexEnabled { + idx := NewMockNamespaceIndex(ctrl) + idx.EXPECT().BlockStatesSnapshot().Return(index.NewBlockStateSnapshot( + tc.indexBootstrapped, + tc.indexBlockStates, + )) + idx.EXPECT().CleanupExpiredFileSets(gomock.Any()) + idx.EXPECT().CleanupDuplicateFileSets() + ns.EXPECT().Index().Return(idx, nil).AnyTimes() + } namespaces = append(namespaces, ns) } @@ -311,6 +507,7 @@ func TestCleanupManagerCleanupCommitlogsAndSnapshots(t *testing.T) { mgr.snapshotMetadataFilesFn = tc.snapshotMetadata mgr.commitLogFilesFn = tc.commitlogs mgr.snapshotFilesFn = tc.snapshots + mgr.indexSnapshotFilesFn = tc.indexSnapshots var deletedFiles []string mgr.deleteFilesFn = func(files []string) error { @@ -431,10 +628,18 @@ func TestCleanupDataAndSnapshotFileSetFiles(t *testing.T) { defer ctrl.Finish() ts := timeFor(36000) - nsOpts := namespaceOptions + nsOpts := namespaceOptions. + SetIndexOptions(namespace.NewIndexOptions(). + SetEnabled(true). 
+ SetBlockSize(7200 * time.Second)) ns := NewMockdatabaseNamespace(ctrl) ns.EXPECT().Options().Return(nsOpts).AnyTimes() + idx := NewMockNamespaceIndex(ctrl) + idx.EXPECT().CleanupExpiredFileSets(gomock.Any()) + idx.EXPECT().CleanupDuplicateFileSets() + ns.EXPECT().Index().Return(idx, nil).AnyTimes() + shard := NewMockdatabaseShard(ctrl) expectedEarliestToRetain := retention.FlushTimeStart(ns.Options().RetentionOptions(), ts) shard.EXPECT().CleanupExpiredFileSets(expectedEarliestToRetain).Return(nil) diff --git a/src/dbnode/storage/flush.go b/src/dbnode/storage/flush.go index 13b04d757b..27396d0fe5 100644 --- a/src/dbnode/storage/flush.go +++ b/src/dbnode/storage/flush.go @@ -28,6 +28,7 @@ import ( "github.com/m3db/m3/src/dbnode/clock" "github.com/m3db/m3/src/dbnode/persist" + "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" "github.com/m3db/m3/src/dbnode/retention" xerrors "github.com/m3db/m3/src/x/errors" @@ -63,7 +64,7 @@ type flushManagerMetrics struct { // is not overly aggressive. 
maxBlocksSnapshottedByNamespace tally.Gauge dataWarmFlushDuration tally.Timer - dataSnapshotDuration tally.Timer + dataAndIndexSnapshotDuration tally.Timer indexFlushDuration tally.Timer commitLogRotationDuration tally.Timer } @@ -75,7 +76,7 @@ func newFlushManagerMetrics(scope tally.Scope) flushManagerMetrics { isIndexFlushing: scope.Gauge("index-flush"), maxBlocksSnapshottedByNamespace: scope.Gauge("max-blocks-snapshotted-by-namespace"), dataWarmFlushDuration: scope.Timer("data-warm-flush-duration"), - dataSnapshotDuration: scope.Timer("data-snapshot-duration"), + dataAndIndexSnapshotDuration: scope.Timer("data-and-index-snapshot-duration"), indexFlushDuration: scope.Timer("index-flush-duration"), commitLogRotationDuration: scope.Timer("commit-log-rotation-duration"), } @@ -96,8 +97,9 @@ type flushManager struct { lastSuccessfulSnapshotStartTime atomic.Int64 // == xtime.UnixNano - logger *zap.Logger - nowFn clock.NowFn + logger *zap.Logger + nowFn clock.NowFn + readIndexInfoFilesFn fs.ReadIndexInfoFilesFn } func newFlushManager( @@ -107,13 +109,14 @@ func newFlushManager( ) databaseFlushManager { opts := database.Options() return &flushManager{ - database: database, - commitlog: commitlog, - opts: opts, - pm: opts.PersistManager(), - metrics: newFlushManagerMetrics(scope), - logger: opts.InstrumentOptions().Logger(), - nowFn: opts.ClockOptions().NowFn(), + database: database, + commitlog: commitlog, + opts: opts, + pm: opts.PersistManager(), + metrics: newFlushManagerMetrics(scope), + logger: opts.InstrumentOptions().Logger(), + nowFn: opts.ClockOptions().NowFn(), + readIndexInfoFilesFn: fs.ReadIndexInfoFiles, } } @@ -152,15 +155,19 @@ func (m *flushManager) Flush(startTime time.Time) error { start := m.nowFn() rotatedCommitlogID, err := m.commitlog.RotateLogs() m.metrics.commitLogRotationDuration.Record(m.nowFn().Sub(start)) - if err == nil { - if err = m.dataSnapshot(namespaces, startTime, rotatedCommitlogID); err != nil { + rotateCommitlogSuccess := err == nil 
+ // We want to use the same snapshot ID across both data/index snapshotting and + // writing of empty snapshots to disk after a successful index flush. + snapshotID := uuid.NewUUID() + if rotateCommitlogSuccess { + if err = m.dataAndIndexSnapshot(namespaces, startTime, rotatedCommitlogID, snapshotID); err != nil { multiErr = multiErr.Add(err) } } else { multiErr = multiErr.Add(fmt.Errorf("error rotating commitlog in mediator tick: %v", err)) } - if err = m.indexFlush(namespaces); err != nil { + if err = m.indexFlush(namespaces, rotatedCommitlogID, snapshotID); err != nil { multiErr = multiErr.Add(err) } @@ -203,18 +210,16 @@ func (m *flushManager) dataWarmFlush( return multiErr.FinalError() } -func (m *flushManager) dataSnapshot( +func (m *flushManager) dataAndIndexSnapshot( namespaces []databaseNamespace, startTime time.Time, rotatedCommitlogID persist.CommitLogFile, + snapshotID uuid.UUID, ) error { - snapshotID := uuid.NewUUID() - snapshotPersist, err := m.pm.StartSnapshotPersist(snapshotID) if err != nil { return err } - m.setState(flushManagerSnapshotInProgress) var ( start = m.nowFn() @@ -222,6 +227,17 @@ func (m *flushManager) dataSnapshot( multiErr = xerrors.NewMultiError() ) for _, ns := range namespaces { + // NB(bodu): Read in index info files and pass them in for determining + // whether or not snapshots are warm or cold. We do this here so we're not + // doing dupe work per block start.
+ fsOpts := m.opts.CommitLogOptions().FilesystemOptions() + infoFiles := m.readIndexInfoFilesFn( + fsOpts.FilePathPrefix(), + ns.ID(), + fsOpts.InfoReaderBufferSize(), + persist.FileSetFlushType, + ) + snapshotBlockStarts, err := m.namespaceSnapshotTimes(ns, startTime) if err != nil { detailedErr := fmt.Errorf( @@ -236,7 +252,11 @@ func (m *flushManager) dataSnapshot( } for _, snapshotBlockStart := range snapshotBlockStarts { err := ns.Snapshot( - snapshotBlockStart, startTime, snapshotPersist) + snapshotBlockStart, + startTime, + snapshotPersist, + infoFiles, + ) if err != nil { detailedErr := fmt.Errorf( @@ -256,12 +276,14 @@ func (m *flushManager) dataSnapshot( if finalErr == nil { m.lastSuccessfulSnapshotStartTime.Store(int64(xtime.ToUnixNano(startTime))) } - m.metrics.dataSnapshotDuration.Record(m.nowFn().Sub(start)) + m.metrics.dataAndIndexSnapshotDuration.Record(m.nowFn().Sub(start)) return finalErr } func (m *flushManager) indexFlush( namespaces []databaseNamespace, + rotatedCommitlogID persist.CommitLogFile, + snapshotID uuid.UUID, ) error { indexFlush, err := m.pm.StartIndexPersist() if err != nil { diff --git a/src/dbnode/storage/flush_test.go b/src/dbnode/storage/flush_test.go index 73fa37ca2e..417a1c71df 100644 --- a/src/dbnode/storage/flush_test.go +++ b/src/dbnode/storage/flush_test.go @@ -148,6 +148,9 @@ func TestFlushManagerFlushAlreadyInProgress(t *testing.T) { // Ensure it doesn't allow a parallel flush. require.Equal(t, errFlushOperationsInProgress, fm.Flush(now)) + // We start snapshot persist twice, once in the regular data/index snapshot path + // and another time after a warm index flush (to write empty snapshots to disk). 
+ doneCh <- struct{}{} doneCh <- struct{}{} }() @@ -170,8 +173,8 @@ func TestFlushManagerFlushDoneFlushError(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(fakeErr) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil).AnyTimes() + mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() mockIndexFlusher := persist.NewMockIndexFlush(ctrl) mockIndexFlusher.EXPECT().DoneIndex().Return(nil) @@ -209,8 +212,8 @@ func TestFlushManagerNamespaceFlushTimesErr(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(nil) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil).AnyTimes() + mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() mockIndexFlusher := persist.NewMockIndexFlush(ctrl) mockIndexFlusher.EXPECT().DoneIndex().Return(nil) @@ -225,7 +228,7 @@ func TestFlushManagerNamespaceFlushTimesErr(t *testing.T) { ns.EXPECT().Options().Return(nsOpts).AnyTimes() ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(false, fakeErr).AnyTimes() - ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() db.EXPECT().OwnedNamespaces().Return([]databaseNamespace{ns}, nil) cl := 
commitlog.NewMockCommitLog(ctrl) @@ -254,8 +257,8 @@ func TestFlushManagerFlushDoneSnapshotError(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(nil) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(fakeErr) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(fakeErr).AnyTimes() + mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() mockIndexFlusher := persist.NewMockIndexFlush(ctrl) mockIndexFlusher.EXPECT().DoneIndex().Return(nil) @@ -289,8 +292,8 @@ func TestFlushManagerFlushDoneIndexError(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(nil) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil).AnyTimes() + mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() fakeErr := errors.New("fake error while marking flush done") mockIndexFlusher := persist.NewMockIndexFlush(ctrl) @@ -322,7 +325,7 @@ func TestFlushManagerSkipNamespaceIndexingDisabled(t *testing.T) { ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(true, nil).AnyTimes() ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() var ( mockFlushPersist = 
persist.NewMockFlushPreparer(ctrl) @@ -333,8 +336,8 @@ func TestFlushManagerSkipNamespaceIndexingDisabled(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(nil) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil).AnyTimes() + mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() mockIndexFlusher := persist.NewMockIndexFlush(ctrl) mockIndexFlusher.EXPECT().DoneIndex().Return(nil) @@ -365,7 +368,12 @@ func TestFlushManagerNamespaceIndexingEnabled(t *testing.T) { ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes() ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(true, nil).AnyTimes() ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + + shard := NewMockdatabaseShard(ctrl) + shard.EXPECT().ID().Return(uint32(0)).AnyTimes() + ns.EXPECT().OwnedShards().Return([]databaseShard{shard}).AnyTimes() + ns.EXPECT().FlushIndex(gomock.Any()).Return(nil) var ( @@ -377,8 +385,8 @@ func TestFlushManagerNamespaceIndexingEnabled(t *testing.T) { mockFlushPersist.EXPECT().DoneFlush().Return(nil) mockPersistManager.EXPECT().StartFlushPersist().Return(mockFlushPersist, nil) - mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil) - mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil) + mockSnapshotPersist.EXPECT().DoneSnapshot(gomock.Any(), testCommitlogFile).Return(nil).AnyTimes() + 
mockPersistManager.EXPECT().StartSnapshotPersist(gomock.Any()).Return(mockSnapshotPersist, nil).AnyTimes() mockIndexFlusher := persist.NewMockIndexFlush(ctrl) mockIndexFlusher.EXPECT().DoneIndex().Return(nil) @@ -547,7 +555,7 @@ func TestFlushManagerFlushSnapshot(t *testing.T) { num = numIntervals(start, snapshotEnd, blockSize) for i := 0; i < num; i++ { st := start.Add(time.Duration(i) * blockSize) - ns.EXPECT().Snapshot(st, now, gomock.Any()) + ns.EXPECT().Snapshot(st, now, gomock.Any(), gomock.Any()) } } diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 1aa73d162f..f0c89c3ebb 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -52,6 +52,7 @@ import ( m3ninxindex "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/index/segment/builder" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist "github.com/m3db/m3/src/m3ninx/persist" xclose "github.com/m3db/m3/src/x/close" "github.com/m3db/m3/src/x/context" @@ -86,15 +87,34 @@ const ( nsIndexReportStatsInterval = 10 * time.Second defaultFlushDocsBatchSize = 8192 + + // Use -1 for unset snapshot versions. 
+ snapshotVersionUnset = -1 ) var ( allQuery = idx.NewAllQuery() ) +type snapshotState struct { + sync.RWMutex + statesByTime map[xtime.UnixNano]index.BlockState + + // segmentsData is used to amortize allocs across the entire index + // snapshotting workload + segmentsData []fst.SegmentData +} + +func newSnapshotState() snapshotState { + return snapshotState{ + statesByTime: make(map[xtime.UnixNano]index.BlockState), + } +} + // nolint: maligned type nsIndex struct { - state nsIndexState + state nsIndexState + snapshotState snapshotState extendedRetentionPeriod time.Duration @@ -111,7 +131,7 @@ type nsIndex struct { namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager indexFilesetsBeforeFn indexFilesetsBeforeFn deleteFilesFn deleteFilesFn - readIndexInfoFilesFn readIndexInfoFilesFn + readIndexInfoFilesFn fs.ReadIndexInfoFilesFn newBlockFn index.NewBlockFn logger *zap.Logger @@ -194,11 +214,6 @@ type indexFilesetsBeforeFn func(dir string, exclusiveTime time.Time, ) ([]string, error) -type readIndexInfoFilesFn func(filePathPrefix string, - namespace ident.ID, - readerBufferSize int, -) []fs.ReadIndexInfoFileResult - type newNamespaceIndexOpts struct { md namespace.Metadata namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager @@ -327,6 +342,7 @@ func newNamespaceIndexWithOptions( blocksByTime: make(map[xtime.UnixNano]index.Block), shardsAssigned: make(map[uint32]struct{}), }, + snapshotState: newSnapshotState(), nowFn: nowFn, blockSize: nsMD.Options().IndexOptions().BlockSize(), @@ -844,16 +860,44 @@ func (i *nsIndex) Bootstrap( i.state.Unlock() }() - var multiErr xerrors.MultiError + var ( + multiErr xerrors.MultiError + fsOpts = i.opts.CommitLogOptions().FilesystemOptions() + infoFiles = i.readIndexInfoFilesFn( + fsOpts.FilePathPrefix(), + i.nsMetadata.ID(), + fsOpts.InfoReaderBufferSize(), + persist.FileSetFlushType, + ) + ) for blockStart, blockResults := range bootstrapResults { block, err := i.ensureBlockPresentWithRLock(blockStart.ToTime()) if err != nil 
{ // should never happen multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err)) continue } + // NB(bodu): For warm snapshots, we need to make sure that we haven't already successfully warm + // flushed this block. We can run into this case when the node crashes between a successful warm + // flush and the next index snapshot. + if _, ok := blockResults.GetBlock(idxpersist.SnapshotWarmIndexVolumeType); ok { + if block.IsSealed() && i.hasIndexWarmFlushedToDisk(infoFiles, blockStart.ToTime()) { + // If we have warm snapshots and the block has been warm flushed already, + // we just discard the warm snapshot data. + blockResults.DeleteBlock(idxpersist.SnapshotWarmIndexVolumeType) + } + } if err := block.AddResults(blockResults); err != nil { multiErr = multiErr.Add(err) } + for volumeType, results := range blockResults.Iter() { + switch volumeType { + case idxpersist.SnapshotColdIndexVolumeType, idxpersist.SnapshotWarmIndexVolumeType: + // Only set if the volume index in the results is set. + if volumeIndex := results.VolumeIndex(); volumeIndex >= 0 { + i.setSnapshotStateVersionLoaded(blockStart.ToTime(), volumeIndex) + } + } + } } return multiErr.FinalError() @@ -867,12 +911,21 @@ func (i *nsIndex) Bootstrapped() bool { } func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceIndexTickResult, error) { - var result namespaceIndexTickResult - i.state.Lock() + var ( + result namespaceIndexTickResult + deletedBlockStarts = make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) + ) defer func() { i.updateBlockStartsWithLock() i.state.Unlock() + // Delete snapshot states after holding lock on state to avoid + // a lock dependency. 
+ i.snapshotState.Lock() + for _, blockStart := range deletedBlockStarts { + delete(i.snapshotState.statesByTime, blockStart) + } + i.snapshotState.Unlock() }() earliestBlockStartToRetain := i.earliestBlockStartToRetainWithLock(startTime) @@ -890,6 +943,7 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd if blockStart.ToTime().Before(earliestBlockStartToRetain) { multiErr = multiErr.Add(block.Close()) delete(i.state.blocksByTime, blockStart) + deletedBlockStarts = append(deletedBlockStarts, blockStart) result.NumBlocksEvicted++ result.NumBlocks-- continue @@ -948,7 +1002,9 @@ func (i *nsIndex) WarmFlush( i.metrics.flushIndexingConcurrency.Update(float64(concurrency)) defer i.metrics.flushIndexingConcurrency.Update(0) - var evicted int + var ( + evicted int + ) for _, block := range flushable { immutableSegments, err := i.flushBlock(flush, block, shards, builder) if err != nil { @@ -984,6 +1040,9 @@ func (i *nsIndex) WarmFlush( zap.Time("blockStart", block.StartTime()), ) } + + // NB(bodu): We should reset any snapshot loaded version to default after a successful warm flush. + i.setSnapshotStateVersionLoaded(block.StartTime(), snapshotVersionUnset) } i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted)) return nil @@ -1009,6 +1068,8 @@ func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { multiErr := xerrors.NewMultiError() for _, block := range flushable { multiErr = multiErr.Add(block.EvictColdMutableSegments()) + // NB(bodu): We should reset any snapshot loaded version to default after a successful cold flush. 
+ i.setSnapshotStateVersionLoaded(block.StartTime(), snapshotVersionUnset) } return multiErr.FinalError() }, nil @@ -1030,6 +1091,7 @@ func (i *nsIndex) flushableBlocks( fsOpts.FilePathPrefix(), i.nsMetadata.ID(), fsOpts.InfoReaderBufferSize(), + persist.FileSetFlushType, ) flushable := make([]index.Block, 0, len(i.state.blocksByTime)) @@ -1069,9 +1131,10 @@ func (i *nsIndex) canFlushBlockWithRLock( case series.WarmWrite: // NB(bodu): We should always attempt to warm flush sealed blocks to disk if // there doesn't already exist data on disk. We're checking this instead of - // `block.NeedsMutableSegmentsEvicted()` since bootstrap writes for cold block starts - // get marked as warm writes if there doesn't already exist data on disk and need to - // properly go through the warm flush lifecycle. + // if the index block has mutable segments that we can evict + // (original check implemented in `block.NeedsMutableSegmentsEvicted()`) since bootstrap writes for + // cold block starts get marked as warm writes if there doesn't already exist data + // on disk so they need to properly go through the warm flush lifecycle. 
if !block.IsSealed() || i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) { return false, nil } @@ -1132,7 +1195,7 @@ func (i *nsIndex) flushBlock( allShards[shard.ID()] = struct{}{} } - preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{ + preparedPersist, err := flush.PrepareIndexFlush(persist.IndexPrepareOptions{ NamespaceMetadata: i.nsMetadata, BlockStart: indexBlock.StartTime(), FileSetType: persist.FileSetFlushType, @@ -1162,12 +1225,11 @@ func (i *nsIndex) flushBlock( closed = true - // Now return the immutable segments return preparedPersist.Close() } func (i *nsIndex) flushBlockSegment( - preparedPersist persist.PreparedIndexPersist, + preparedPersist persist.PreparedIndexFlushPersist, indexBlock index.Block, shards []databaseShard, builder segment.DocumentsBuilder, @@ -1262,6 +1324,80 @@ func (i *nsIndex) flushBlockSegment( return preparedPersist.Persist(builder) } +func (i *nsIndex) Snapshot( + shards map[uint32]struct{}, + blockStart, + snapshotTime time.Time, + snapshotPersist persist.SnapshotPreparer, + infoFiles []fs.ReadIndexInfoFileResult, +) error { + i.state.RLock() + if i.state.closed { + i.state.RUnlock() + return errDbIndexAlreadyClosed + } + // Blocks are removed during ticks once they are out of retention, this means + // that we may snapshot data that's out of retention which is fine. + block, ok := i.state.blocksByTime[xtime.ToUnixNano(blockStart)] + if !ok { + // Do nothing if there is no index block to snapshot. + i.state.RUnlock() + return nil + } + i.state.RUnlock() + + if block.NumSegments() == 0 { + // Do nothing if no index segments to snapshot. + return nil + } + + // NB(bodu): There is a time window between when a block is sealed and when it is + // flushed that we are accumulating data in cold segments but we will snapshot to disk as + // warm segments. 
This is fine since we have not yet warm flushed this index block, and loading + // all snapshotted segments in as warm mutable segments will place them in the correct lifecycle. + indexVolumeType := idxpersist.SnapshotWarmIndexVolumeType + // NB(bodu): If a block is sealed && there is flushed data on disk, + // then we're writing data into cold segments only. + if block.IsSealed() && i.hasIndexWarmFlushedToDisk(infoFiles, block.StartTime()) { + indexVolumeType = idxpersist.SnapshotColdIndexVolumeType + } + + prepareOpts := persist.IndexPrepareSnapshotOptions{ + IndexPrepareOptions: persist.IndexPrepareOptions{ + NamespaceMetadata: i.nsMetadata, + BlockStart: blockStart, + FileSetType: persist.FileSetSnapshotType, + Shards: shards, + IndexVolumeType: indexVolumeType, + }, + SnapshotTime: snapshotTime, + } + prepared, err := snapshotPersist.PrepareIndexSnapshot(prepareOpts) + if err != nil { + return err + } + + ctx := context.NewContext() + defer ctx.Close() + + // NB(bodu): Snapshotting currently happens in a single thread, but we lock + // on the reusable segments data resource to be safe. + i.snapshotState.Lock() + defer i.snapshotState.Unlock() + i.snapshotState.segmentsData = i.snapshotState.segmentsData[:0] + i.snapshotState.segmentsData, err = block.AppendMemorySegmentsData(ctx, i.snapshotState.segmentsData) + if err != nil { + return err + } + for _, segmentData := range i.snapshotState.segmentsData { + if err := prepared.Persist(segmentData); err != nil { + return err + } + } + + return prepared.Close() +} + func (i *nsIndex) sanitizeAllowDuplicatesWriteError(err error) error { if err == nil { return nil @@ -1905,7 +2041,7 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (index.Block } // NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent. - // We need to seal cold blocks write away for cold writes. + // We need to seal cold blocks right away for cold writes.
if !blockStart.After(i.lastSealableBlockStart(i.nowFn())) { if err := block.Seal(); err != nil { return nil, err @@ -1994,6 +2130,7 @@ func (i *nsIndex) CleanupDuplicateFileSets() error { fsOpts.FilePathPrefix(), i.nsMetadata.ID(), fsOpts.InfoReaderBufferSize(), + persist.FileSetFlushType, ) segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart := make(map[xtime.UnixNano]map[idxpersist.IndexVolumeType][]fs.Segments) @@ -2063,13 +2200,20 @@ func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { FilesystemOptions(). SetFilePathPrefix(opts.OutputDirectory) + segDataWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter() + if err != nil { + return err + } + + var results []fst.SegmentData for _, block := range i.state.blocksByTime { - segmentsData, err := block.MemorySegmentsData(ctx) + results = results[:0] + results, err = block.AppendMemorySegmentsData(ctx, results) if err != nil { return err } - for numSegment, segmentData := range segmentsData { + for numSegment, segmentData := range results { indexWriter, err := fs.NewIndexWriter(fsOpts) if err != nil { return err @@ -2092,12 +2236,11 @@ func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { return err } - segWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter(segmentData) - if err != nil { + if err := segDataWriter.Reset(segmentData); err != nil { return err } - if err := indexWriter.WriteSegmentFileSet(segWriter); err != nil { + if err := indexWriter.WriteSegmentFileSet(segDataWriter); err != nil { return err } @@ -2110,6 +2253,48 @@ func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { return nil } +func (i *nsIndex) BlockStatesSnapshot() index.BlockStateSnapshot { + i.state.RLock() + bootstrapped := i.state.bootstrapState == Bootstrapped + i.state.RUnlock() + if !bootstrapped { + // Needs to be bootstrapped. 
+ return index.NewBlockStateSnapshot(false, index.BootstrappedBlockStateSnapshot{}) + } + + i.snapshotState.Lock() + defer i.snapshotState.Unlock() + snapshot := make(map[xtime.UnixNano]index.BlockState, len(i.snapshotState.statesByTime)) + for time, state := range i.snapshotState.statesByTime { + snapshot[time] = state + } + + return index.NewBlockStateSnapshot(true, index.BootstrappedBlockStateSnapshot{ + Snapshot: snapshot, + }) +} + +func (i *nsIndex) setSnapshotStateVersionLoaded(blockStart time.Time, version int) { + i.snapshotState.Lock() + defer i.snapshotState.Unlock() + state := i.ensureSnapshotStateWithLock(blockStart) + state.SnapshotVersionLoaded = version + i.snapshotState.statesByTime[xtime.ToUnixNano(blockStart)] = state +} + +// ensureSnapshotStateWithLock gets snapshot state given a block start and ensures that it exists. +func (i *nsIndex) ensureSnapshotStateWithLock(blockStart time.Time) index.BlockState { + state, ok := i.snapshotState.statesByTime[xtime.ToUnixNano(blockStart)] + if !ok { + state = index.BlockState{ + // The unset value for a snapshot version is -1.
+ SnapshotVersionLoaded: snapshotVersionUnset, + } + i.snapshotState.statesByTime[xtime.ToUnixNano(blockStart)] = state + } + return state +} + func (i *nsIndex) Close() error { i.state.Lock() if !i.isOpenWithRLock() { diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index f1f0e80b9f..91279e0c97 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -37,6 +37,7 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/m3ninx/persist" + idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/m3ninx/search" "github.com/m3db/m3/src/m3ninx/search/executor" "github.com/m3db/m3/src/x/context" @@ -338,10 +339,7 @@ func (b *block) executorWithRLock() (search.Executor, error) { } func (b *block) segmentReadersWithRLock() ([]segment.Reader, error) { - expectedReaders := b.mutableSegments.Len() - for _, coldSeg := range b.coldMutableSegments { - expectedReaders += coldSeg.Len() - } + expectedReaders := b.numSegmentsWithRLock() b.shardRangesSegmentsByVolumeType.forEachSegmentGroup(func(group blockShardRangesSegments) error { expectedReaders += len(group.segments) return nil @@ -833,18 +831,6 @@ func (b *block) AddResults( b.Lock() defer b.Unlock() - multiErr := xerrors.NewMultiError() - for volumeType, results := range resultsByVolumeType.Iter() { - multiErr = multiErr.Add(b.addResults(volumeType, results)) - } - - return multiErr.FinalError() -} - -func (b *block) addResults( - volumeType persist.IndexVolumeType, - results result.IndexBlock, -) error { // NB(prateek): we have to allow bootstrap to succeed even if we're Sealed because // of topology changes. i.e. if the current m3db process is assigned new shards, // we need to include their data in the index. 
@@ -854,14 +840,38 @@ func (b *block) addResults( return errUnableToBootstrapBlockClosed } - // First check fulfilled is correct - min, max := results.Fulfilled().MinMax() - if min.Before(b.blockStart) || max.After(b.blockEnd) { - blockRange := xtime.Range{Start: b.blockStart, End: b.blockEnd} - return fmt.Errorf("fulfilled range %s is outside of index block range: %s", - results.Fulfilled().SummaryString(), blockRange.String()) + multiErr := xerrors.NewMultiError() + for volumeType, results := range resultsByVolumeType.Iter() { + // First check fulfilled is correct + min, max := results.Fulfilled().MinMax() + if min.Before(b.blockStart) || max.After(b.blockEnd) { + blockRange := xtime.Range{Start: b.blockStart, End: b.blockEnd} + err := fmt.Errorf("fulfilled range %s is outside of index block range: %s", + results.Fulfilled().SummaryString(), blockRange.String()) + multiErr = multiErr.Add(err) + // Do not add results that fall outside of this block's range. + continue + } + + switch volumeType { + case idxpersist.SnapshotColdIndexVolumeType: + // NB(bodu): There is always at least 1 cold mutable segment. + coldBlock := b.coldMutableSegments[len(b.coldMutableSegments)-1] + coldBlock.addOnDiskSegmentsWithoutLock(results.Segments()) + case idxpersist.SnapshotWarmIndexVolumeType: + b.mutableSegments.addOnDiskSegmentsWithoutLock(results.Segments()) + default: + multiErr = multiErr.Add(b.addResults(volumeType, results)) + } } + return multiErr.FinalError() +} + +func (b *block) addResults( + volumeType persist.IndexVolumeType, + results result.IndexBlock, +) error { shardRangesSegments, ok := b.shardRangesSegmentsByVolumeType[volumeType] if !ok { shardRangesSegments = make([]blockShardRangesSegments, 0) @@ -876,9 +886,9 @@ func (b *block) addResults( readThroughSegments := make([]segment.Segment, 0, len(segments)) for _, seg := range segments { elem := seg.Segment() - if immSeg, ok := elem.(segment.ImmutableSegment); ok { - // only wrap the immutable segments with a read through cache. 
- elem = NewReadThroughSegment(immSeg, plCache, readThroughOpts) + if fstSeg, ok := elem.(fst.Segment); ok { + // only wrap the fst segments with a read through cache. + elem = NewReadThroughSegment(fstSeg, plCache, readThroughOpts) } readThroughSegments = append(readThroughSegments, elem) } @@ -1020,24 +1028,6 @@ func (b *block) IsSealed() bool { return b.IsSealedWithRLock() } -func (b *block) NeedsMutableSegmentsEvicted() bool { - b.RLock() - defer b.RUnlock() - - // Check any mutable segments that can be evicted after a flush. - anyMutableSegmentNeedsEviction := b.mutableSegments.NeedsEviction() - - // Check boostrapped segments and to see if any of them need an eviction. - b.shardRangesSegmentsByVolumeType.forEachSegment(func(seg segment.Segment) error { - if mutableSeg, ok := seg.(segment.MutableSegment); ok { - anyMutableSegmentNeedsEviction = anyMutableSegmentNeedsEviction || mutableSeg.Size() > 0 - } - return nil - }) - - return anyMutableSegmentNeedsEviction -} - func (b *block) EvictMutableSegments() error { b.Lock() defer b.Unlock() @@ -1047,7 +1037,7 @@ func (b *block) EvictMutableSegments() error { b.mutableSegments.Close() - // Close any other mutable segments that was added. + // Close any other mutable segments that were added. 
multiErr := xerrors.NewMultiError() for _, shardRangesSegments := range b.shardRangesSegmentsByVolumeType { for idx := range shardRangesSegments { @@ -1110,24 +1100,41 @@ func (b *block) RotateColdMutableSegments() { )) } -func (b *block) MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) { +func (b *block) AppendMemorySegmentsData( + ctx context.Context, + results []fst.SegmentData, +) ([]fst.SegmentData, error) { b.RLock() defer b.RUnlock() + var err error if b.state == blockStateClosed { return nil, errBlockAlreadyClosed } - data, err := b.mutableSegments.MemorySegmentsData(ctx) + results, err = b.mutableSegments.AppendMemorySegmentsData(ctx, results) if err != nil { return nil, err } for _, coldSeg := range b.coldMutableSegments { - coldData, err := coldSeg.MemorySegmentsData(ctx) + results, err = coldSeg.AppendMemorySegmentsData(ctx, results) if err != nil { return nil, err } - data = append(data, coldData...) } - return data, nil + return results, nil +} + +func (b *block) NumSegments() int { + b.RLock() + defer b.RUnlock() + return b.numSegmentsWithRLock() +} + +func (b *block) numSegmentsWithRLock() int { + count := b.mutableSegments.Len() + for _, coldSeg := range b.coldMutableSegments { + count += coldSeg.Len() + } + return count } func (b *block) Close() error { diff --git a/src/dbnode/storage/index/block_test.go b/src/dbnode/storage/index/block_test.go index 5bb078b676..aa50a73db4 100644 --- a/src/dbnode/storage/index/block_test.go +++ b/src/dbnode/storage/index/block_test.go @@ -1207,76 +1207,6 @@ func TestBlockAddResultsDoesNotCoverCurrentData(t *testing.T) { require.NoError(t, b.Close()) } -func TestBlockNeedsMutableSegmentsEvicted(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - - testMD := newTestNSMetadata(t) - start := time.Now().Truncate(time.Hour) - blk, err := NewBlock(start, testMD, BlockOptions{}, - namespace.NewRuntimeOptionsManager("foo"), testOpts) - require.NoError(t, err) - - b, ok := 
blk.(*block) - require.True(t, ok) - - // empty to start, so shouldn't need eviction - require.False(t, b.NeedsMutableSegmentsEvicted()) - - // perform write and ensure it says it needs eviction - h1 := NewMockOnIndexSeries(ctrl) - h1.EXPECT().OnIndexFinalize(xtime.ToUnixNano(start)) - h1.EXPECT().OnIndexSuccess(xtime.ToUnixNano(start)) - batch := NewWriteBatch(WriteBatchOptions{ - IndexBlockSize: time.Hour, - }) - batch.Append(WriteBatchEntry{ - Timestamp: start.Add(time.Minute), - OnIndexSeries: h1, - }, testDoc1()) - res, err := b.WriteBatch(batch) - require.NoError(t, err) - require.Equal(t, int64(1), res.NumSuccess) - require.Equal(t, int64(0), res.NumError) - - require.True(t, b.NeedsMutableSegmentsEvicted()) -} - -func TestBlockNeedsMutableSegmentsEvictedMutableSegments(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - - testMD := newTestNSMetadata(t) - start := time.Now().Truncate(time.Hour) - blk, err := NewBlock(start, testMD, BlockOptions{}, - namespace.NewRuntimeOptionsManager("foo"), testOpts) - require.NoError(t, err) - - b, ok := blk.(*block) - require.True(t, ok) - - // empty to start, so shouldn't need eviction - require.False(t, b.NeedsMutableSegmentsEvicted()) - seg1 := segment.NewMockMutableSegment(ctrl) - seg1.EXPECT().Size().Return(int64(0)).AnyTimes() - results := result.NewIndexBlockByVolumeType(start) - results.SetBlock(idxpersist.DefaultIndexVolumeType, - result.NewIndexBlock([]result.Segment{result.NewSegment(seg1, true)}, - result.NewShardTimeRangesFromRange(start, start.Add(time.Hour), 1, 2, 3))) - require.NoError(t, b.AddResults(results)) - require.False(t, b.NeedsMutableSegmentsEvicted()) - - seg2 := segment.NewMockMutableSegment(ctrl) - seg2.EXPECT().Size().Return(int64(1)).AnyTimes() - seg3 := segment.NewMockSegment(ctrl) - results = result.NewIndexBlockByVolumeType(start) - results.SetBlock(idxpersist.DefaultIndexVolumeType, - result.NewIndexBlock([]result.Segment{result.NewSegment(seg2, true), 
result.NewSegment(seg3, true)}, - result.NewShardTimeRangesFromRange(start, start.Add(time.Hour), 1, 2, 4))) - require.NoError(t, b.AddResults(results)) - require.True(t, b.NeedsMutableSegmentsEvicted()) -} - func TestBlockEvictMutableSegmentsSimple(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() diff --git a/src/dbnode/storage/index/index_mock.go b/src/dbnode/storage/index/index_mock.go index 2aede296a2..27afeb409e 100644 --- a/src/dbnode/storage/index/index_mock.go +++ b/src/dbnode/storage/index/index_mock.go @@ -874,20 +874,6 @@ func (mr *MockBlockMockRecorder) IsSealed() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsSealed", reflect.TypeOf((*MockBlock)(nil).IsSealed)) } -// NeedsMutableSegmentsEvicted mocks base method -func (m *MockBlock) NeedsMutableSegmentsEvicted() bool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "NeedsMutableSegmentsEvicted") - ret0, _ := ret[0].(bool) - return ret0 -} - -// NeedsMutableSegmentsEvicted indicates an expected call of NeedsMutableSegmentsEvicted -func (mr *MockBlockMockRecorder) NeedsMutableSegmentsEvicted() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NeedsMutableSegmentsEvicted", reflect.TypeOf((*MockBlock)(nil).NeedsMutableSegmentsEvicted)) -} - // EvictMutableSegments mocks base method func (m *MockBlock) EvictMutableSegments() error { m.ctrl.T.Helper() @@ -942,19 +928,33 @@ func (mr *MockBlockMockRecorder) RotateColdMutableSegments() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RotateColdMutableSegments", reflect.TypeOf((*MockBlock)(nil).RotateColdMutableSegments)) } -// MemorySegmentsData mocks base method -func (m *MockBlock) MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) { +// AppendMemorySegmentsData mocks base method +func (m *MockBlock) AppendMemorySegmentsData(ctx context.Context, results []fst.SegmentData) ([]fst.SegmentData, error) { m.ctrl.T.Helper() - ret := 
m.ctrl.Call(m, "MemorySegmentsData", ctx) + ret := m.ctrl.Call(m, "AppendMemorySegmentsData", ctx, results) ret0, _ := ret[0].([]fst.SegmentData) ret1, _ := ret[1].(error) return ret0, ret1 } -// MemorySegmentsData indicates an expected call of MemorySegmentsData -func (mr *MockBlockMockRecorder) MemorySegmentsData(ctx interface{}) *gomock.Call { +// AppendMemorySegmentsData indicates an expected call of AppendMemorySegmentsData +func (mr *MockBlockMockRecorder) AppendMemorySegmentsData(ctx, results interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AppendMemorySegmentsData", reflect.TypeOf((*MockBlock)(nil).AppendMemorySegmentsData), ctx, results) +} + +// NumSegments mocks base method +func (m *MockBlock) NumSegments() int { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NumSegments") + ret0, _ := ret[0].(int) + return ret0 +} + +// NumSegments indicates an expected call of NumSegments +func (mr *MockBlockMockRecorder) NumSegments() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MemorySegmentsData", reflect.TypeOf((*MockBlock)(nil).MemorySegmentsData), ctx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NumSegments", reflect.TypeOf((*MockBlock)(nil).NumSegments)) } // Close mocks base method diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index baa904cd7b..329b52d36e 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -29,6 +29,7 @@ import ( "time" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" "github.com/m3db/m3/src/dbnode/storage/index/compaction" "github.com/m3db/m3/src/dbnode/storage/index/segments" m3ninxindex "github.com/m3db/m3/src/m3ninx/index" @@ -45,8 +46,8 @@ import ( ) var ( - errUnableToWriteBlockConcurrent = errors.New("unable to write, index block is being written 
to already") errMutableSegmentsAlreadyClosed = errors.New("mutable segments already closed") + errUnableToWriteBlockConcurrent = errors.New("unable to write, index block is being written to already") errForegroundCompactorNoPlan = errors.New("index foreground compactor failed to generate a plan") errForegroundCompactorBadPlanFirstTask = errors.New("index foreground compactor generated plan without mutable segment in first task") errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task") @@ -67,6 +68,7 @@ type mutableSegments struct { foregroundSegments []*readableSeg backgroundSegments []*readableSeg + onDiskSegments []*readableSeg compact mutableSegmentsCompact blockStart time.Time @@ -196,6 +198,9 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { func (m *mutableSegments) AddReaders(readers []segment.Reader) ([]segment.Reader, error) { m.RLock() defer m.RUnlock() + if m.state == mutableSegmentsStateClosed { + return readers, nil // closed: nothing to add, keep the caller's readers + } var err error readers, err = m.addReadersWithLock(m.foregroundSegments, readers) @@ -208,6 +213,11 @@ func (m *mutableSegments) AddReaders(readers []segment.Reader) ([]segment.Reader return nil, err } + readers, err = m.addReadersWithLock(m.onDiskSegments, readers) + if err != nil { + return nil, err + } + return readers, nil } @@ -226,18 +236,23 @@ func (m *mutableSegments) Len() int { m.RLock() defer m.RUnlock() - return len(m.foregroundSegments) + len(m.backgroundSegments) + return len(m.foregroundSegments) + len(m.backgroundSegments) + len(m.onDiskSegments) } -func (m *mutableSegments) MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) { +func (m *mutableSegments) AppendMemorySegmentsData( + ctx context.Context, + results []fst.SegmentData, +) ([]fst.SegmentData, error) { m.RLock() defer m.RUnlock() + if m.state == mutableSegmentsStateClosed { + return results, nil // closed: nothing to append, keep the caller's results + } - // NB(r): This is for debug operations, do not bother 
about allocations. - var results []fst.SegmentData for _, segs := range [][]*readableSeg{ m.foregroundSegments, m.backgroundSegments, + m.onDiskSegments, } { for _, seg := range segs { fstSegment, ok := seg.Segment().(fst.Segment) @@ -260,14 +275,18 @@ func (m *mutableSegments) NeedsEviction() bool { m.RLock() defer m.RUnlock() - var needsEviction bool - for _, seg := range m.foregroundSegments { - needsEviction = needsEviction || seg.Segment().Size() > 0 - } - for _, seg := range m.backgroundSegments { - needsEviction = needsEviction || seg.Segment().Size() > 0 + for _, segs := range [][]*readableSeg{ + m.foregroundSegments, + m.backgroundSegments, + m.onDiskSegments, + } { + for _, seg := range segs { + if seg.Segment().Size() > 0 { + return true + } + } } - return needsEviction + return false } func (m *mutableSegments) NumSegmentsAndDocs() (int64, int64) { @@ -277,13 +296,15 @@ func (m *mutableSegments) NumSegmentsAndDocs() (int64, int64) { var ( numSegments, numDocs int64 ) - for _, seg := range m.foregroundSegments { - numSegments++ - numDocs += seg.Segment().Size() - } - for _, seg := range m.backgroundSegments { - numSegments++ - numDocs += seg.Segment().Size() + for _, segs := range [][]*readableSeg{ + m.foregroundSegments, + m.backgroundSegments, + m.onDiskSegments, + } { + for _, seg := range segs { + numSegments++ + numDocs += seg.Segment().Size() + } } return numSegments, numDocs } @@ -310,6 +331,15 @@ func (m *mutableSegments) Stats(reporter BlockStatsReporter) { Size: seg.Segment().Size(), }) } + for _, seg := range m.onDiskSegments { + _, mutable := seg.Segment().(segment.MutableSegment) + reporter.ReportSegmentStats(BlockSegmentStats{ + Type: FlushedSegment, + Mutable: mutable, + Age: seg.Age(), + Size: seg.Segment().Size(), + }) + } reporter.ReportIndexingStats(BlockIndexingStats{ IndexConcurrency: m.writeIndexingConcurrency, @@ -319,11 +349,32 @@ func (m *mutableSegments) Stats(reporter BlockStatsReporter) { func (m *mutableSegments) Close() { 
m.Lock() defer m.Unlock() + if m.state == mutableSegmentsStateClosed { + return + } m.state = mutableSegmentsStateClosed m.cleanupCompactWithLock() + m.cleanupOnDiskSegmentsWithLock() m.optsListener.Close() } +// Index block handles locking when adding on disk segments. +func (m *mutableSegments) addOnDiskSegmentsWithoutLock(segments []result.Segment) { + for _, s := range segments { + m.onDiskSegments = append(m.onDiskSegments, newReadableSeg(s.Segment(), m.opts)) + } +} + +func (m *mutableSegments) cleanupOnDiskSegmentsWithLock() { + // Check if need to close all the compacted segments due to + // mutableSegments being closed. + if !m.shouldEvictCompactedSegmentsWithLock() { + return + } + m.closeCompactedSegmentsWithLock(m.onDiskSegments) + m.onDiskSegments = nil +} + func (m *mutableSegments) maybeBackgroundCompactWithLock() { if m.compact.compactingBackground { return @@ -434,6 +485,22 @@ func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { } } + // Freeze terminal segments. + for _, seg := range plan.UnusedSegments { + if fstSeg, ok := seg.Segment.(fst.Segment); ok { + state, err := fstSeg.State() + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error freezing terminal segments", zap.Error(err)) + }) + continue + } + if state != fst.FrozenIndexSegmentState { + fstSeg.Freeze() + } + } + } + for i, task := range plan.Tasks { err := m.backgroundCompactWithTask(task, log, logger.With(zap.Int("task", i))) @@ -481,12 +548,12 @@ func (m *mutableSegments) backgroundCompactWithTask( // Add a read through cache for repeated expensive queries against // background compacted segments since they can live for quite some // time and accrue a large set of documents. 
- if immSeg, ok := compacted.(segment.ImmutableSegment); ok { + if fstSeg, ok := compacted.(fst.Segment); ok { var ( plCache = m.opts.PostingsListCache() readThroughOpts = m.opts.ReadThroughSegmentOptions() ) - compacted = NewReadThroughSegment(immSeg, plCache, readThroughOpts) + compacted = NewReadThroughSegment(fstSeg, plCache, readThroughOpts) } // Rotate out the replaced frozen segments and add the compacted one. diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 52aaaf6251..1430c4f529 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -27,7 +27,9 @@ import ( "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/x/context" "github.com/pborman/uuid" ) @@ -37,9 +39,9 @@ var ( errCantCloseClosedSegment = errors.New("cant close closed segment") ) -// Ensure FST segment implements ImmutableSegment so can be casted upwards -// and mmap's can be freed. -var _ segment.ImmutableSegment = (*ReadThroughSegment)(nil) +// Ensure ReadThroughSegment implements fst.Segment so it can be casted upwards +// and mmap's can be freed and we can perform snapshots. +var _ fst.Segment = (*ReadThroughSegment)(nil) // ReadThroughSegment wraps a segment with a postings list cache so that // queries can be transparently cached in a read through manner. In addition, @@ -51,7 +53,7 @@ var _ segment.ImmutableSegment = (*ReadThroughSegment)(nil) type ReadThroughSegment struct { sync.RWMutex - segment segment.ImmutableSegment + segment fst.Segment uuid uuid.UUID postingsListCache *PostingsListCache @@ -72,10 +74,10 @@ type ReadThroughSegmentOptions struct { // NewReadThroughSegment creates a new read through segment. 
func NewReadThroughSegment( - seg segment.ImmutableSegment, + seg fst.Segment, cache *PostingsListCache, opts ReadThroughSegmentOptions, -) segment.Segment { +) fst.Segment { return &ReadThroughSegment{ segment: seg, opts: opts, @@ -155,6 +157,21 @@ func (r *ReadThroughSegment) Size() int64 { return r.segment.Size() } +// SegmentData returns in memory data for a segment. +func (r *ReadThroughSegment) SegmentData(ctx context.Context) (fst.SegmentData, error) { + return r.segment.SegmentData(ctx) +} + +// Freeze marks the segment state as frozen (no longer compactable). +func (r *ReadThroughSegment) Freeze() { + r.segment.Freeze() +} + +// State returns the segment state (frozen/compactable). +func (r *ReadThroughSegment) State() (fst.IndexSegmentState, error) { + return r.segment.State() +} + type readThroughSegmentReader struct { // reader is explicitly not embedded at the top level // of the struct to force new methods added to index.Reader diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 88323e7909..59c19e934b 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -386,19 +386,13 @@ type Block interface { // IsSealed returns whether this block was sealed. IsSealed() bool - // NeedsMutableSegmentsEvicted returns whether this block has any mutable segments - // that are not-empty and sealed. - // A sealed non-empty mutable segment needs to get evicted from memory as - // soon as it can be to reduce memory footprint. - NeedsMutableSegmentsEvicted() bool - // EvictMutableSegments closes any mutable segments, this is only applicable // valid to be called once the block and hence mutable segments are sealed. // It is expected that results have been added to the block that covers any // data the mutable segments should have held at this time. 
EvictMutableSegments() error - // NeedsMutableSegmentsEvicted returns whether this block has any cold mutable segments + // NeedsColdMutableSegmentsEvicted returns whether this block has any cold mutable segments // that are not-empty and sealed. NeedsColdMutableSegmentsEvicted() bool @@ -410,8 +404,14 @@ type Block interface { // new cold mutable segment to write to. RotateColdMutableSegments() - // MemorySegmentsData returns all in memory segments data. - MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) + // AppendMemorySegmentsData appends all in-memory segments data to the results. + AppendMemorySegmentsData( + ctx context.Context, + results []fst.SegmentData, + ) ([]fst.SegmentData, error) + + // NumSegments returns the number of index segments. + NumSegments() int // Close will release any held resources and close the Block. Close() error @@ -990,3 +990,38 @@ type Options interface { // QueryLimits returns the current query limits. QueryLimits() limits.QueryLimits } + +// BlockStateSnapshot represents a snapshot of an index block's state at +// a moment in time. +type BlockStateSnapshot struct { + bootstrapped bool + snapshot BootstrappedBlockStateSnapshot +} + +// NewBlockStateSnapshot constructs a new BlockStateSnapshot. +func NewBlockStateSnapshot( + bootstrapped bool, + snapshot BootstrappedBlockStateSnapshot, +) BlockStateSnapshot { + return BlockStateSnapshot{ + bootstrapped: bootstrapped, + snapshot: snapshot, + } +} + +// UnwrapValue returns a BootstrappedBlockStateSnapshot and a boolean indicating whether the +// snapshot is bootstrapped or not. +func (s BlockStateSnapshot) UnwrapValue() (BootstrappedBlockStateSnapshot, bool) { + return s.snapshot, s.bootstrapped +} + +// BootstrappedBlockStateSnapshot represents a bootstrapped block state snapshot. +type BootstrappedBlockStateSnapshot struct { + Snapshot map[xtime.UnixNano]BlockState +} + +// BlockState contains the state of a block. 
+type BlockState struct { + // SnapshotVersionLoaded represents snapshot version loaded into mem. + SnapshotVersionLoaded int +} diff --git a/src/dbnode/storage/index_test.go b/src/dbnode/storage/index_test.go index c258db46d8..ecd537f076 100644 --- a/src/dbnode/storage/index_test.go +++ b/src/dbnode/storage/index_test.go @@ -37,6 +37,7 @@ import ( "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/idx" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" idxpersist "github.com/m3db/m3/src/m3ninx/persist" "github.com/m3db/m3/src/x/context" xerrors "github.com/m3db/m3/src/x/errors" @@ -139,6 +140,7 @@ func TestNamespaceIndexCleanupDuplicateFilesets(t *testing.T) { filePathPrefix string, namespace ident.ID, readerBufferSize int, + fileSetType persist.FileSetType, ) []fs.ReadIndexInfoFileResult { return infoFiles } @@ -205,6 +207,7 @@ func TestNamespaceIndexCleanupDuplicateFilesetsNoop(t *testing.T) { filePathPrefix string, namespace ident.ID, readerBufferSize int, + fileSetType persist.FileSetType, ) []fs.ReadIndexInfoFileResult { return infoFiles } @@ -319,6 +322,7 @@ func TestNamespaceIndexFlushShardStateNotSuccess(t *testing.T) { mockFlush := persist.NewMockIndexFlush(ctrl) + // We won't be flushing any snapshots in this error case. 
require.NoError(t, idx.WarmFlush(mockFlush, shards)) } @@ -396,6 +400,124 @@ func TestNamespaceIndexSetExtendedRetentionPeriod(t *testing.T) { assert.Equal(t, longerRetention, idx.effectiveRetentionPeriodWithLock()) } +func TestNamespaceIndexSnapshotColdBlock(t *testing.T) { + ctrl := gomock.NewController(xtest.Reporter{T: t}) + defer ctrl.Finish() + + test := newTestIndex(t, ctrl) + + now := time.Now().Truncate(test.indexBlockSize) + idx := test.index.(*nsIndex) + + defer func() { + require.NoError(t, idx.Close()) + }() + + var closed bool + snapshotPreparer := persist.NewMockSnapshotPreparer(ctrl) + prepared := persist.PreparedIndexSnapshotPersist{ + Persist: func(fst.SegmentData) error { return nil }, + Close: func() error { closed = true; return nil }, + } + + shards := make(map[uint32]struct{}) + for _, shard := range testShardSet.AllIDs() { + shards[shard] = struct{}{} + } + + blockStart := now.Add(-test.indexBlockSize) + mockBlock := index.NewMockBlock(ctrl) + mockBlock.EXPECT().AppendMemorySegmentsData(gomock.Any(), gomock.Any()).Return([]fst.SegmentData{fst.SegmentData{}}, nil) + mockBlock.EXPECT().IsSealed().Return(true) + mockBlock.EXPECT().NumSegments().Return(1) + mockBlock.EXPECT().StartTime().Return(blockStart) + mockBlock.EXPECT().Close().Return(nil) + idx.state.blocksByTime[xtime.ToUnixNano(blockStart)] = mockBlock + + prepareOpts := xtest.CmpMatcher(persist.IndexPrepareSnapshotOptions{ + IndexPrepareOptions: persist.IndexPrepareOptions{ + NamespaceMetadata: idx.nsMetadata, + Shards: shards, + BlockStart: blockStart, + FileSetType: persist.FileSetSnapshotType, + IndexVolumeType: idxpersist.SnapshotColdIndexVolumeType, + }, + SnapshotTime: now, + }) + snapshotPreparer.EXPECT().PrepareIndexSnapshot(prepareOpts).Return(prepared, nil) + + require.NoError(t, idx.Snapshot( + shards, + blockStart, + now, + snapshotPreparer, + []fs.ReadIndexInfoFileResult{ + { + ID: fs.FileSetFileIdentifier{ + BlockStart: blockStart, + }, + }, + }, + )) + require.True(t, 
closed) +} + +func TestNamespaceIndexSnapshotWarmBlock(t *testing.T) { + ctrl := gomock.NewController(xtest.Reporter{T: t}) + defer ctrl.Finish() + + test := newTestIndex(t, ctrl) + + now := time.Now().Truncate(test.indexBlockSize) + idx := test.index.(*nsIndex) + + defer func() { + require.NoError(t, idx.Close()) + }() + + var closed bool + snapshotPreparer := persist.NewMockSnapshotPreparer(ctrl) + prepared := persist.PreparedIndexSnapshotPersist{ + Persist: func(fst.SegmentData) error { return nil }, + Close: func() error { closed = true; return nil }, + } + + shards := make(map[uint32]struct{}) + for _, shard := range testShardSet.AllIDs() { + shards[shard] = struct{}{} + } + + blockStart := now.Add(-test.indexBlockSize) + mockBlock := index.NewMockBlock(ctrl) + mockBlock.EXPECT().AppendMemorySegmentsData(gomock.Any(), gomock.Any()).Return([]fst.SegmentData{fst.SegmentData{}}, nil) + mockBlock.EXPECT().IsSealed().Return(false) + mockBlock.EXPECT().NumSegments().Return(1) + mockBlock.EXPECT().StartTime().Return(blockStart).AnyTimes() + mockBlock.EXPECT().Close().Return(nil) + idx.state.blocksByTime[xtime.ToUnixNano(blockStart)] = mockBlock + + prepareOpts := xtest.CmpMatcher(persist.IndexPrepareSnapshotOptions{ + IndexPrepareOptions: persist.IndexPrepareOptions{ + NamespaceMetadata: idx.nsMetadata, + Shards: shards, + BlockStart: blockStart, + FileSetType: persist.FileSetSnapshotType, + IndexVolumeType: idxpersist.SnapshotWarmIndexVolumeType, + }, + SnapshotTime: now, + }) + snapshotPreparer.EXPECT().PrepareIndexSnapshot(prepareOpts).Return(prepared, nil) + + require.NoError(t, idx.Snapshot( + shards, + blockStart, + now, + snapshotPreparer, + []fs.ReadIndexInfoFileResult{}, + )) + require.True(t, closed) +} + func verifyFlushForShards( t *testing.T, ctrl *gomock.Controller, @@ -427,6 +549,7 @@ func verifyFlushForShards( shardMap[shard] = struct{}{} dbShards = append(dbShards, mockShard) } + earliestBlockStartToRetain := 
retention.FlushTimeStartForRetentionPeriod(idx.retentionPeriod, idx.blockSize, now) for blockStart := earliestBlockStartToRetain; blockStart.Before(warmBlockStart); blockStart = blockStart.Add(idx.blockSize) { numBlocks++ @@ -448,11 +571,11 @@ func verifyFlushForShards( actualDocs = append(actualDocs, b.Docs()...) return nil } - preparedPersist := persist.PreparedIndexPersist{ + preparedPersist := persist.PreparedIndexFlushPersist{ Close: closer, Persist: persistFn, } - mockFlush.EXPECT().PrepareIndex(xtest.CmpMatcher(persist.IndexPrepareOptions{ + mockFlush.EXPECT().PrepareIndexFlush(xtest.CmpMatcher(persist.IndexPrepareOptions{ NamespaceMetadata: idx.nsMetadata, BlockStart: blockStart, FileSetType: persist.FileSetFlushType, diff --git a/src/dbnode/storage/namespace.go b/src/dbnode/storage/namespace.go index 6465df365d..88fc466e0a 100644 --- a/src/dbnode/storage/namespace.go +++ b/src/dbnode/storage/namespace.go @@ -116,7 +116,7 @@ type dbNamespace struct { nopts namespace.Options seriesOpts series.Options nowFn clock.NowFn - snapshotFilesFn snapshotFilesFn + snapshotFilesFn fs.SnapshotFilesFn log *zap.Logger bootstrapState BootstrapState @@ -1363,6 +1363,7 @@ func (n *dbNamespace) Snapshot( blockStart, snapshotTime time.Time, snapshotPersist persist.SnapshotPreparer, + infoFiles []fs.ReadIndexInfoFileResult, ) error { // NB(rartoul): This value can be used for emitting metrics, but should not be used // for business logic. 
@@ -1390,6 +1391,7 @@ func (n *dbNamespace) Snapshot( var ( seriesPersist int multiErr xerrors.MultiError + shardIDs = make(map[uint32]struct{}, len(n.OwnedShards())) ) for _, shard := range n.OwnedShards() { result, err := shard.Snapshot(blockStart, snapshotTime, snapshotPersist, nsCtx) @@ -1400,10 +1402,22 @@ func (n *dbNamespace) Snapshot( } seriesPersist += result.SeriesPersist + + shardIDs[shard.ID()] = struct{}{} } n.metrics.snapshotSeriesPersist.Inc(int64(seriesPersist)) + if idx := n.reverseIndex; idx != nil { + multiErr = multiErr.Add(idx.Snapshot( + shardIDs, + blockStart, + snapshotTime, + snapshotPersist, + infoFiles, + )) + } + res := multiErr.FinalError() n.metrics.snapshot.ReportSuccessOrError(res, n.nowFn().Sub(callStart)) return res diff --git a/src/dbnode/storage/namespace_bootstrap_data_accumulator_test.go b/src/dbnode/storage/namespace_bootstrap_data_accumulator_test.go index ab24c1a2e8..943acca3c5 100644 --- a/src/dbnode/storage/namespace_bootstrap_data_accumulator_test.go +++ b/src/dbnode/storage/namespace_bootstrap_data_accumulator_test.go @@ -25,7 +25,6 @@ import ( "testing" "github.com/m3db/m3/src/dbnode/storage/bootstrap" - "github.com/m3db/m3/src/dbnode/storage/series" "github.com/m3db/m3/src/x/ident" xtest "github.com/m3db/m3/src/x/test" @@ -83,7 +82,7 @@ func testCheckoutSeries(t *testing.T, checkoutFn checkoutFn) { defer ctrl.Finish() var ( ns = NewMockdatabaseNamespace(ctrl) - series = series.NewMockDatabaseSeries(ctrl) + series = bootstrap.NewMockSeriesRef(ctrl) acc = NewDatabaseNamespaceDataAccumulator(ns) shardID = uint32(7) @@ -138,7 +137,7 @@ func testAccumulatorRelease(t *testing.T, checkoutFn checkoutFn) { release = &releaser{} ref = SeriesReadWriteRef{ UniqueIndex: uniqueIdx, - Series: series.NewMockDatabaseSeries(ctrl), + Series: bootstrap.NewMockSeriesRef(ctrl), ReleaseReadWriteRef: release, } ) diff --git a/src/dbnode/storage/namespace_test.go b/src/dbnode/storage/namespace_test.go index 978ca641bb..3f8c8b59e1 100644 --- 
a/src/dbnode/storage/namespace_test.go +++ b/src/dbnode/storage/namespace_test.go @@ -30,6 +30,7 @@ import ( "github.com/m3db/m3/src/cluster/shard" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/sharding" @@ -554,7 +555,7 @@ func TestNamespaceSnapshotNotBootstrapped(t *testing.T) { blockSize := ns.Options().RetentionOptions().BlockSize() blockStart := time.Now().Truncate(blockSize) - require.Equal(t, errNamespaceNotBootstrapped, ns.Snapshot(blockStart, blockStart, nil)) + require.Equal(t, errNamespaceNotBootstrapped, ns.Snapshot(blockStart, blockStart, nil, []fs.ReadIndexInfoFileResult{})) } func TestNamespaceSnapshotAllShardsSuccess(t *testing.T) { @@ -606,6 +607,8 @@ func testSnapshotWithShardSnapshotErrs( ns, closer := newTestNamespaceWithIDOpts(t, defaultTestNs1ID, namespace.NewOptions().SetSnapshotEnabled(true)) defer closer() + idx := NewMockNamespaceIndex(ctrl) + ns.reverseIndex = idx ns.bootstrapState = Bootstrapped now := time.Now() ns.nowFn = func() time.Time { @@ -616,6 +619,7 @@ func testSnapshotWithShardSnapshotErrs( shardBootstrapStates = ShardBootstrapStates{} blockSize = ns.Options().RetentionOptions().BlockSize() blockStart = now.Truncate(blockSize) + shardIDs = make(map[uint32]struct{}) ) for i, tc := range shardMethodResults { @@ -629,9 +633,11 @@ func testSnapshotWithShardSnapshotErrs( } ns.shards[testShardIDs[i].ID()] = shard shardBootstrapStates[shardID] = tc.shardBootstrapStateBeforeTick + shardIDs[shardID] = struct{}{} } + idx.EXPECT().Snapshot(shardIDs, blockStart, now, gomock.Any(), gomock.Any()).Return(nil) - return ns.Snapshot(blockStart, now, nil) + return ns.Snapshot(blockStart, now, nil, []fs.ReadIndexInfoFileResult{}) } func TestNamespaceTruncate(t *testing.T) { diff --git a/src/dbnode/storage/readonly_index_proxy.go b/src/dbnode/storage/readonly_index_proxy.go index 
fec2bc8eea..bff3f6f86f 100644 --- a/src/dbnode/storage/readonly_index_proxy.go +++ b/src/dbnode/storage/readonly_index_proxy.go @@ -25,6 +25,7 @@ import ( "time" "github.com/m3db/m3/src/dbnode/persist" + "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/bootstrap/result" "github.com/m3db/m3/src/dbnode/storage/index" @@ -119,6 +120,20 @@ func (r readOnlyIndexProxy) DebugMemorySegments(opts DebugMemorySegmentsOptions) return r.underlying.DebugMemorySegments(opts) } +func (r readOnlyIndexProxy) BlockStatesSnapshot() index.BlockStateSnapshot { + return index.NewBlockStateSnapshot(false, index.BootstrappedBlockStateSnapshot{}) +} + +func (r readOnlyIndexProxy) Snapshot( + shards map[uint32]struct{}, + blockStart, + snapshotTime time.Time, + snapshotPersist persist.SnapshotPreparer, + infoFiles []fs.ReadIndexInfoFileResult, +) error { + return nil +} + func (r readOnlyIndexProxy) Close() error { return nil } diff --git a/src/dbnode/storage/series/lookup/entry.go b/src/dbnode/storage/series/lookup/entry.go index 8e1917524e..ed78030d29 100644 --- a/src/dbnode/storage/series/lookup/entry.go +++ b/src/dbnode/storage/series/lookup/entry.go @@ -229,12 +229,19 @@ func (entry *Entry) LoadBlock( block block.DatabaseBlock, writeType series.WriteType, ) error { - // TODO(bodu): We can remove this once we have index snapshotting as index snapshots will - // contained snapshotted index segments that cover snapshotted data. + return entry.Series.LoadBlock(block, writeType) +} + +// LoadBlockAndIndex loads a single block into the series and attempts to index the series +// if not already attempted. 
+func (entry *Entry) LoadBlockAndIndex( + block block.DatabaseBlock, + writeType series.WriteType, +) error { if err := entry.maybeIndex(block.StartTime()); err != nil { return err } - return entry.Series.LoadBlock(block, writeType) + return entry.LoadBlock(block, writeType) } func (entry *Entry) maybeIndex(timestamp time.Time) error { diff --git a/src/dbnode/storage/series/types.go b/src/dbnode/storage/series/types.go index cf434c8e30..a0ccd8b43a 100644 --- a/src/dbnode/storage/series/types.go +++ b/src/dbnode/storage/series/types.go @@ -137,7 +137,7 @@ type DatabaseSeries interface { // NumActiveBlocks returns the number of active blocks the series currently holds. NumActiveBlocks() int - /// LoadBlock loads a single block into the series. + // LoadBlock loads a single block into the series. LoadBlock( block block.DatabaseBlock, writeType WriteType, diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index 292a5dfb05..1f88dbea64 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -179,7 +179,7 @@ type dbShard struct { filesetsFn filesetsFn filesetPathsBeforeFn filesetPathsBeforeFn deleteFilesFn deleteFilesFn - snapshotFilesFn snapshotFilesFn + snapshotFilesFn fs.SnapshotFilesFn sleepFn func(time.Duration) identifierPool ident.Pool contextPool context.Pool diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index feb8fbef7f..a8416e8f16 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -1632,17 +1632,17 @@ func (mr *MockdatabaseNamespaceMockRecorder) ColdFlush(flush interface{}) *gomoc } // Snapshot mocks base method -func (m *MockdatabaseNamespace) Snapshot(blockStart, snapshotTime time.Time, flush persist.SnapshotPreparer) error { +func (m *MockdatabaseNamespace) Snapshot(blockStart, snapshotTime time.Time, flush persist.SnapshotPreparer, infoFiles []fs.ReadIndexInfoFileResult) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Snapshot", 
blockStart, snapshotTime, flush) + ret := m.ctrl.Call(m, "Snapshot", blockStart, snapshotTime, flush, infoFiles) ret0, _ := ret[0].(error) return ret0 } // Snapshot indicates an expected call of Snapshot -func (mr *MockdatabaseNamespaceMockRecorder) Snapshot(blockStart, snapshotTime, flush interface{}) *gomock.Call { +func (mr *MockdatabaseNamespaceMockRecorder) Snapshot(blockStart, snapshotTime, flush, infoFiles interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Snapshot", reflect.TypeOf((*MockdatabaseNamespace)(nil).Snapshot), blockStart, snapshotTime, flush) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Snapshot", reflect.TypeOf((*MockdatabaseNamespace)(nil).Snapshot), blockStart, snapshotTime, flush, infoFiles) } // NeedsFlush mocks base method @@ -2629,6 +2629,34 @@ func (mr *MockNamespaceIndexMockRecorder) DebugMemorySegments(opts interface{}) return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DebugMemorySegments", reflect.TypeOf((*MockNamespaceIndex)(nil).DebugMemorySegments), opts) } +// Snapshot mocks base method +func (m *MockNamespaceIndex) Snapshot(shards map[uint32]struct{}, blockStart, snapshotTime time.Time, snapshotPersist persist.SnapshotPreparer, infoFiles []fs.ReadIndexInfoFileResult) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Snapshot", shards, blockStart, snapshotTime, snapshotPersist, infoFiles) + ret0, _ := ret[0].(error) + return ret0 +} + +// Snapshot indicates an expected call of Snapshot +func (mr *MockNamespaceIndexMockRecorder) Snapshot(shards, blockStart, snapshotTime, snapshotPersist, infoFiles interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Snapshot", reflect.TypeOf((*MockNamespaceIndex)(nil).Snapshot), shards, blockStart, snapshotTime, snapshotPersist, infoFiles) +} + +// BlockStatesSnapshot mocks base method +func (m *MockNamespaceIndex) BlockStatesSnapshot() index.BlockStateSnapshot { + 
m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "BlockStatesSnapshot") + ret0, _ := ret[0].(index.BlockStateSnapshot) + return ret0 +} + +// BlockStatesSnapshot indicates an expected call of BlockStatesSnapshot +func (mr *MockNamespaceIndexMockRecorder) BlockStatesSnapshot() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BlockStatesSnapshot", reflect.TypeOf((*MockNamespaceIndex)(nil).BlockStatesSnapshot)) +} + // Close mocks base method func (m *MockNamespaceIndex) Close() error { m.ctrl.T.Helper() diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index b7753bc9f8..59181c68f6 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -429,9 +429,7 @@ type databaseNamespace interface { WarmFlush(blockStart time.Time, flush persist.FlushPreparer) error // FlushIndex flushes in-memory index data. - FlushIndex( - flush persist.IndexFlush, - ) error + FlushIndex(flush persist.IndexFlush) error // ColdFlush flushes unflushed in-memory ColdWrites. ColdFlush( @@ -439,7 +437,12 @@ type databaseNamespace interface { ) error // Snapshot snapshots unflushed in-memory WarmWrites. - Snapshot(blockStart, snapshotTime time.Time, flush persist.SnapshotPreparer) error + Snapshot( + blockStart, + snapshotTime time.Time, + flush persist.SnapshotPreparer, + infoFiles []fs.ReadIndexInfoFileResult, + ) error // NeedsFlush returns true if the namespace needs a flush for the // period: [start, end] (both inclusive). @@ -633,7 +636,7 @@ type databaseShard interface { onFlush persist.OnFlushSeries, ) (ShardColdFlush, error) - // Snapshot snapshot's the unflushed WarmWrites in this shard. + // Snapshot snapshots the unflushed WarmWrites in this shard. Snapshot( blockStart time.Time, snapshotStart time.Time, @@ -783,6 +786,19 @@ type NamespaceIndex interface { // DebugMemorySegments allows for debugging memory segments. 
DebugMemorySegments(opts DebugMemorySegmentsOptions) error + // Snapshot in-memory index data to disk for faster bootstrapping. + Snapshot( + shards map[uint32]struct{}, + blockStart, + snapshotTime time.Time, + snapshotPersist persist.SnapshotPreparer, + infoFiles []fs.ReadIndexInfoFileResult, + ) error + + // BlockStatesSnapshot returns per index block state. Currently, the per + // block state only captures the loaded index snapshot version (if any). + BlockStatesSnapshot() index.BlockStateSnapshot + // Close will release the index resources and close the index. Close() error } diff --git a/src/docs/dbnode/bootstrapping.md b/src/docs/dbnode/bootstrapping.md new file mode 100644 index 0000000000..4282ac4060 --- /dev/null +++ b/src/docs/dbnode/bootstrapping.md @@ -0,0 +1,92 @@ +# bootstrapper + +Bootstrapping is done in the following order: + - fs + - commitlog + - peers + +Each bootstrapper runs through two phases (two shard time ranges). The first phase is cold data and the second phase is warm data. + +## fs bootstrapper + +The fs bootstrapper inspects both data and index info files on disk and marks requested shard time ranges +for each as fulfilled respectively when it finds persisted data for either. + +The fs bootstrapper *only* bootstraps from persisted data (both index and TSDB) on disk. It passes along unfulfilled +shard time ranges to the next bootstrapper (as does every bootstrapper). + +### TSDB data on disk missing index data + +There are a few cases where TSDB blocks on disk may be missing an index block on disk. + 1. TSDB blocks smaller than index blocks + - TSDB blocks can exist on disk that don't cover the entire index block + 2. Node crash between a successful warm TSDB flush(es) (for the entire index block) + and successful index flush. + 3. Crash after streaming TSDB blocks from peer during peers bootstrapping. + - Shard will still be marked as "initializing". 
+ +We handle case 1 by relying on bootstrapping from index snapshots/commitlogs to load in-mem index data. +We handle case 2 by just waiting for a successful index warm flush to complete. +We handle case 3 by passing unfulfilled index shard time ranges along to the peers bootstrapper (for uninitialized shards) so that it can build index segments for TSDB blocks missing index data. + +Also, w.r.t. cold flush, we don't write out checkpoint files for cold flushed TSDB data until +index data has been successfully persisted to disk so cold flushed data will never be missing index data. + +Additional notes on when data becomes visible on disk: + - TSDB data becomes visible on disk after each shard.WarmFlush() op completes successfully + - Index data becomes visible on disk after each index.flushBlock() op completes successfully + +## commitlog bootstrapper + +The commit log bootstrapper bootstraps both data and index snapshots across all shard time ranges (up to retention). +It bootstraps commit logs only once (commit log bootstrap results are cached after a single run). + +The commitlog does not bootstrap any data for "initializing" shards and will not mark any shard time ranges for these shards as fulfilled. + +## peers bootstrapper + +The peers bootstrapper bootstraps shard data for "initializing" shards during topology changes (when a node has received new shards). +It is also used to bootstrap any remaining data from peers and for full node joins. + +The peers bootstrapper currently only fetches TSDB blocks from peers. It will either explicitly index these series when loading +series blocks which happens during the second bootstrapper phase (warm phase) or build and normally persist index segments for +the fetched TSDB blocks. + +## Cache policies + +The tasks carried out by each bootstrapper vary a lot depending on the series cache policy being used. 
+ +### CacheAll series cache policy + +For the cache all policy the filesystem bootstrapper will load all series and all the data for each block and return the entire set of data. This will keep every series and series block on heap. + +The peers bootstrapper similarly bootstraps all the data from peers that the filesystem does not have and returns the entire set of data fetched. + +### RecentlyRead series cache policy + +For the recently read policy the filesystem bootstrapper will simply fulfill the time ranges requested matching without actually loading the series and blocks from the files it discovers. This relies on data being fetched lazily from the filesystem when data is required for a series that does not live on heap. + +The peers bootstrapper will bootstrap all time ranges requested, and if performing a bootstrap with persistence enabled for a time range, will write the data to disk and then remove the results from memory. A bootstrap with persistence enabled is used for any data that is immutable at the time that bootstrapping commences. For time ranges that are mutable the peer bootstrapper will still write the data out to disk in a durable manner, but in the form of a snapshot, and the series and blocks will still be returned directly as a result from the bootstrapper. This enables the commit log bootstrapper to recover the data in case the node shuts down before the in-memory data can be flushed. + +## Topology changes + +When nodes are added to a replication group, shards are given away to the joining node. Those shards are closed and we re-bootstrap with the shards that we own. +When nodes are removed from a replication group, shards from the removed node are given to remaining nodes in a replication group. The remaining nodes in the replication group will bootstrap the "new" shards that were assigned to them. +Note that we also take writes for shards that we own while bootstrapping. 
However, we do not allow warm/cold flushes to happen while bootstrapping. + +For example, see the following sequences: +(Node add) +- Node 1: + - Initial bootstrap (256 shards) + - Node add + - Bootstrap (128 shards) // These are the remaining shards it owns. +- Node 2: + - Node add + - Initial bootstrap (128 shards) // These are received from Node 1 + +(Node remove) +- Node 1: + - Node remove + - Bootstrap (128 shards) // These are received from Node 2, it owns 256 now. +- Node 2: + - Node remove diff --git a/src/dbnode/client/README.md b/src/docs/dbnode/client.md similarity index 100% rename from src/dbnode/client/README.md rename to src/docs/dbnode/client.md diff --git a/src/dbnode/persist/fs/commitlog/README.md b/src/docs/dbnode/commitlog.md similarity index 100% rename from src/dbnode/persist/fs/commitlog/README.md rename to src/docs/dbnode/commitlog.md diff --git a/src/dbnode/digest/README.md b/src/docs/dbnode/digest.md similarity index 71% rename from src/dbnode/digest/README.md rename to src/docs/dbnode/digest.md index 361df3407c..9b210d9044 100644 --- a/src/dbnode/digest/README.md +++ b/src/docs/dbnode/digest.md @@ -1,6 +1,6 @@ # digest -This package consolidates all our digest algorithms used for data integrity into a single place. Adler32 is used and dependendent on the use case we rely on the standard library or a modified rolling hash version that can be stack allocated. +This contains some notes on all the digest algorithms used for data integrity in the digest package. Adler32 is used and dependent on the use case we rely on the standard library or a modified rolling hash version that can be stack allocated. For highly concurrent callsites that require digests, they use the stack based adler32 library to avoid having to pool digest structs. The stack based adler32 library is a few percent slower than the standard library adler32 algorithm but heap allocation free. 
diff --git a/src/dbnode/storage/README.md b/src/docs/dbnode/flush.md similarity index 66% rename from src/dbnode/storage/README.md rename to src/docs/dbnode/flush.md index 0c54d521f1..11f0aa021b 100644 --- a/src/dbnode/storage/README.md +++ b/src/docs/dbnode/flush.md @@ -1,20 +1,31 @@ -# storage +# Flush -Storage related documentation. +Index and data flush documentation. -## Flush consistency model +## Consistency model -Flush occurs in the following steps: +Warm and cold flush each have their own independent lifecycle: + +Warm flush: + - warm flush cleanup + - expired/duplicate index files + - inactive snapshot/namespace files. + - index/data snapshots - data warm flush - rotate commit log + - data and index snapshot + - drops rotated commit log when we are done + - index warm flush + +Cold flush: + - cold flush cleanup + - out of retention/compacted data files + - data for no longer owned shards - data cold flush - rotate cold mutable index segments - flush cold tsdb data and write most files to disk (except checkpoint files) - flush cold index data to disk and reload - evict rotated cold mutable index segments - write tsdb checkpoint files (completes the tsdb cold flush lifecycle) - - data snapshot - - drops rotated commit log when we are done - - index flush Since we rotate the commit log before we perform a data cold flush and only drop the rotate commit log after data snapshotting is done we guarantee that no writes will be lost if the node crashes. After data cold flush completes, any new cold writes will exist in the active commit log (and not be dropped) when data snapshotting finishes. This is why data snapshotting only needs to snapshot warm data blocks (that need to be flushed). 
diff --git a/src/dbnode/storage/index/README.md b/src/docs/dbnode/index.md similarity index 100% rename from src/dbnode/storage/index/README.md rename to src/docs/dbnode/index.md diff --git a/src/dbnode/encoding/proto/docs/README.md b/src/docs/dbnode/proto/README.md similarity index 100% rename from src/dbnode/encoding/proto/docs/README.md rename to src/docs/dbnode/proto/README.md diff --git a/src/dbnode/encoding/proto/docs/encoding.md b/src/docs/dbnode/proto/encoding.md similarity index 100% rename from src/dbnode/encoding/proto/docs/encoding.md rename to src/docs/dbnode/proto/encoding.md diff --git a/src/dbnode/encoding/proto/docs/marshal.md b/src/docs/dbnode/proto/marshal.md similarity index 100% rename from src/dbnode/encoding/proto/docs/marshal.md rename to src/docs/dbnode/proto/marshal.md diff --git a/src/dbnode/encoding/proto/docs/unmarshal.md b/src/docs/dbnode/proto/unmarshal.md similarity index 100% rename from src/dbnode/encoding/proto/docs/unmarshal.md rename to src/docs/dbnode/proto/unmarshal.md diff --git a/src/docs/dbnode/roadmap.md b/src/docs/dbnode/roadmap.md new file mode 100644 index 0000000000..97205f08cf --- /dev/null +++ b/src/docs/dbnode/roadmap.md @@ -0,0 +1,7 @@ +# Roadmap + +Upcoming changes will be added here. + +## Upcoming changes + +- Migrating to 1:1 sizing of index and TSDB blocks. Initially we are restricting creation of new namespaces to 1:1 sizing. diff --git a/src/dbnode/storage/series/README.md b/src/docs/dbnode/series.md similarity index 100% rename from src/dbnode/storage/series/README.md rename to src/docs/dbnode/series.md diff --git a/src/docs/dbnode/snapshotting.md b/src/docs/dbnode/snapshotting.md new file mode 100644 index 0000000000..465aa5addb --- /dev/null +++ b/src/docs/dbnode/snapshotting.md @@ -0,0 +1,30 @@ +# Snapshotting + +Index and data snapshotting documentation. + +## Consistency model + +Both data and index snapshotting happen in the warm flush lifecycle. 
Each snapshot run is assigned a UUID as its snapshot ID which is written to a snapshot metadata file. A snapshot metadata file is written on each snapshot run regardless of whether or not any index or data filesets were written to disk. For each block start within retention, we snapshot any in-memory data (no work is done if there is no data in-mem). + +We perform data and index snapshot cleanup before we perform snapshotting in the warm flush lifecycle. The cleanup and snapshot processes cannot be run concurrently as the cleanup process relies on the latest snapshot UUID to perform cleanup. The snapshot cleanup logic for index and data snapshots is as follows: + +- Data snapshots: Delete everything but the latest snapshot UUID. +- Index snapshots: Delete everything up to the snapshot version loaded into memory (happens when we bootstrap from index snapshots) if set or delete everything but the latest snapshot UUID. + +## Eviction of loaded index snapshots + +There are both warm and cold snapshots which are differentiated by their index volume type. +``` +// SnapshotColdIndexVolumeType holds cold index snapshot data. +SnapshotColdIndexVolumeType IndexVolumeType = "snapshot_cold" +// SnapshotWarmIndexVolumeType holds warm index snapshot data. +SnapshotWarmIndexVolumeType IndexVolumeType = "snapshot_warm" +``` + +Warm snapshots are evicted from memory on a per block basis at the end of a warm flush for an index block and cold snapshots are evicted at the end of a cold flush for an index block. + +## Stale index snapshots + +We can have stale index snapshots in between when a successful cold and/or warm flush occurs and when the next index snapshot gets written to disk. + +For cold index blocks, we will flush the stale snapshot from memory during the next index cold flush. For warm index blocks, we check to see if the index block has been sealed and if a successful warm flush has occurred when we add bootstrap results in the index. 
If we've already successfully warm flushed a block that means any warm index snapshots are considered "stale" and are not loaded as they will never get flushed if they are. diff --git a/src/m3ninx/doc/doc_mock.go b/src/m3ninx/doc/doc_mock.go index 0cec3d8619..592021f2da 100644 --- a/src/m3ninx/doc/doc_mock.go +++ b/src/m3ninx/doc/doc_mock.go @@ -1,7 +1,7 @@ // Code generated by MockGen. DO NOT EDIT. // Source: github.com/m3db/m3/src/m3ninx/doc/types.go -// Copyright (c) 2018 Uber Technologies, Inc. +// Copyright (c) 2020 Uber Technologies, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/src/m3ninx/index/segment/fst/fst_mock.go b/src/m3ninx/index/segment/fst/fst_mock.go index 536ab55522..a81370cb1f 100644 --- a/src/m3ninx/index/segment/fst/fst_mock.go +++ b/src/m3ninx/index/segment/fst/fst_mock.go @@ -278,6 +278,18 @@ func (mr *MockSegmentMockRecorder) FreeMmap() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FreeMmap", reflect.TypeOf((*MockSegment)(nil).FreeMmap)) } +// Freeze mocks base method +func (m *MockSegment) Freeze() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Freeze") +} + +// Freeze indicates an expected call of Freeze +func (mr *MockSegmentMockRecorder) Freeze() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Freeze", reflect.TypeOf((*MockSegment)(nil).Freeze)) +} + // Reader mocks base method func (m *MockSegment) Reader() (segment.Reader, error) { m.ctrl.T.Helper() @@ -322,6 +334,21 @@ func (mr *MockSegmentMockRecorder) Size() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Size", reflect.TypeOf((*MockSegment)(nil).Size)) } +// State mocks base method +func (m *MockSegment) State() (IndexSegmentState, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "State") + ret0, _ := ret[0].(IndexSegmentState) + ret1, _ := ret[1].(error) + 
return ret0, ret1 +} + +// State indicates an expected call of State +func (mr *MockSegmentMockRecorder) State() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "State", reflect.TypeOf((*MockSegment)(nil).State)) +} + // TermsIterable mocks base method func (m *MockSegment) TermsIterable() segment.TermsIterable { m.ctrl.T.Helper() diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index 7d0efdd0a5..2cd4fa7560 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -57,10 +57,21 @@ var ( errFSTFieldsDataUnset = errors.New("fst fields data bytes are not set") ) +// IndexSegmentState is the state of an index segment. +type IndexSegmentState int + +const ( + // CompactableIndexSegmentState is a still compactable index segment. + CompactableIndexSegmentState IndexSegmentState = iota + // FrozenIndexSegmentState is a no longer compactable frozen index segment. + FrozenIndexSegmentState +) + // SegmentData represent the collection of required parameters to construct a Segment. 
type SegmentData struct { Version Version Metadata []byte + State IndexSegmentState DocsData mmap.Descriptor DocsIdxData mmap.Descriptor @@ -195,6 +206,24 @@ func (r *fsSegment) SegmentData(ctx context.Context) (SegmentData, error) { return r.data, nil } +func (r *fsSegment) Freeze() { + r.Lock() + defer r.Unlock() + if r.closed { + return + } + r.data.State = FrozenIndexSegmentState +} + +func (r *fsSegment) State() (IndexSegmentState, error) { + r.RLock() + defer r.RUnlock() + if r.closed { + return 0, errReaderClosed + } + return r.data.State, nil +} + func (r *fsSegment) Size() int64 { r.RLock() defer r.RUnlock() diff --git a/src/m3ninx/index/segment/fst/types.go b/src/m3ninx/index/segment/fst/types.go index 1ba27bf740..dc6c64c2e9 100644 --- a/src/m3ninx/index/segment/fst/types.go +++ b/src/m3ninx/index/segment/fst/types.go @@ -61,6 +61,12 @@ type Segment interface { // Note: Must close context when done with the data // so that can resources can be free'd safely. SegmentData(ctx context.Context) (SegmentData, error) + + // Freeze is used to freeze an fst seg. This op is done when the + // fst seg has reached a terminal state in the compaction process. + Freeze() + // State returns the state of the underlying fst seg (either compactable or frozen). + State() (IndexSegmentState, error) } // Writer writes out a FST segment from the provided elements. 
diff --git a/src/m3ninx/persist/persist_mock.go b/src/m3ninx/persist/persist_mock.go index 7776be1956..e32b9266a2 100644 --- a/src/m3ninx/persist/persist_mock.go +++ b/src/m3ninx/persist/persist_mock.go @@ -29,6 +29,7 @@ import ( "reflect" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/x/mmap" "github.com/golang/mock/gomock" @@ -150,6 +151,20 @@ func (mr *MockIndexSegmentFileSetWriterMockRecorder) SegmentMetadata() *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadata", reflect.TypeOf((*MockIndexSegmentFileSetWriter)(nil).SegmentMetadata)) } +// SegmentState mocks base method +func (m *MockIndexSegmentFileSetWriter) SegmentState() fst.IndexSegmentState { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentState") + ret0, _ := ret[0].(fst.IndexSegmentState) + return ret0 +} + +// SegmentState indicates an expected call of SegmentState +func (mr *MockIndexSegmentFileSetWriterMockRecorder) SegmentState() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentState", reflect.TypeOf((*MockIndexSegmentFileSetWriter)(nil).SegmentState)) +} + // Files mocks base method func (m *MockIndexSegmentFileSetWriter) Files() []IndexSegmentFileType { m.ctrl.T.Helper() @@ -257,6 +272,20 @@ func (mr *MockMutableSegmentFileSetWriterMockRecorder) SegmentMetadata() *gomock return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadata", reflect.TypeOf((*MockMutableSegmentFileSetWriter)(nil).SegmentMetadata)) } +// SegmentState mocks base method +func (m *MockMutableSegmentFileSetWriter) SegmentState() fst.IndexSegmentState { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentState") + ret0, _ := ret[0].(fst.IndexSegmentState) + return ret0 +} + +// SegmentState indicates an expected call of SegmentState +func (mr *MockMutableSegmentFileSetWriterMockRecorder) SegmentState() *gomock.Call { + mr.mock.ctrl.T.Helper() + return 
mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentState", reflect.TypeOf((*MockMutableSegmentFileSetWriter)(nil).SegmentState)) +} + // Files mocks base method func (m *MockMutableSegmentFileSetWriter) Files() []IndexSegmentFileType { m.ctrl.T.Helper() @@ -299,6 +328,141 @@ func (mr *MockMutableSegmentFileSetWriterMockRecorder) Reset(arg0 interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reset", reflect.TypeOf((*MockMutableSegmentFileSetWriter)(nil).Reset), arg0) } +// MockFSTSegmentDataFileSetWriter is a mock of FSTSegmentDataFileSetWriter interface +type MockFSTSegmentDataFileSetWriter struct { + ctrl *gomock.Controller + recorder *MockFSTSegmentDataFileSetWriterMockRecorder +} + +// MockFSTSegmentDataFileSetWriterMockRecorder is the mock recorder for MockFSTSegmentDataFileSetWriter +type MockFSTSegmentDataFileSetWriterMockRecorder struct { + mock *MockFSTSegmentDataFileSetWriter +} + +// NewMockFSTSegmentDataFileSetWriter creates a new mock instance +func NewMockFSTSegmentDataFileSetWriter(ctrl *gomock.Controller) *MockFSTSegmentDataFileSetWriter { + mock := &MockFSTSegmentDataFileSetWriter{ctrl: ctrl} + mock.recorder = &MockFSTSegmentDataFileSetWriterMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockFSTSegmentDataFileSetWriter) EXPECT() *MockFSTSegmentDataFileSetWriterMockRecorder { + return m.recorder +} + +// SegmentType mocks base method +func (m *MockFSTSegmentDataFileSetWriter) SegmentType() IndexSegmentType { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentType") + ret0, _ := ret[0].(IndexSegmentType) + return ret0 +} + +// SegmentType indicates an expected call of SegmentType +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) SegmentType() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentType", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).SegmentType)) +} + +// MajorVersion 
mocks base method +func (m *MockFSTSegmentDataFileSetWriter) MajorVersion() int { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MajorVersion") + ret0, _ := ret[0].(int) + return ret0 +} + +// MajorVersion indicates an expected call of MajorVersion +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) MajorVersion() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MajorVersion", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).MajorVersion)) +} + +// MinorVersion mocks base method +func (m *MockFSTSegmentDataFileSetWriter) MinorVersion() int { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MinorVersion") + ret0, _ := ret[0].(int) + return ret0 +} + +// MinorVersion indicates an expected call of MinorVersion +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) MinorVersion() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MinorVersion", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).MinorVersion)) +} + +// SegmentMetadata mocks base method +func (m *MockFSTSegmentDataFileSetWriter) SegmentMetadata() []byte { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentMetadata") + ret0, _ := ret[0].([]byte) + return ret0 +} + +// SegmentMetadata indicates an expected call of SegmentMetadata +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) SegmentMetadata() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadata", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).SegmentMetadata)) +} + +// SegmentState mocks base method +func (m *MockFSTSegmentDataFileSetWriter) SegmentState() fst.IndexSegmentState { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentState") + ret0, _ := ret[0].(fst.IndexSegmentState) + return ret0 +} + +// SegmentState indicates an expected call of SegmentState +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) SegmentState() *gomock.Call { + mr.mock.ctrl.T.Helper() + 
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentState", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).SegmentState)) +} + +// Files mocks base method +func (m *MockFSTSegmentDataFileSetWriter) Files() []IndexSegmentFileType { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Files") + ret0, _ := ret[0].([]IndexSegmentFileType) + return ret0 +} + +// Files indicates an expected call of Files +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) Files() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Files", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).Files)) +} + +// WriteFile mocks base method +func (m *MockFSTSegmentDataFileSetWriter) WriteFile(fileType IndexSegmentFileType, writer io.Writer) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WriteFile", fileType, writer) + ret0, _ := ret[0].(error) + return ret0 +} + +// WriteFile indicates an expected call of WriteFile +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) WriteFile(fileType, writer interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WriteFile", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).WriteFile), fileType, writer) +} + +// Reset mocks base method +func (m *MockFSTSegmentDataFileSetWriter) Reset(arg0 fst.SegmentData) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Reset", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// Reset indicates an expected call of Reset +func (mr *MockFSTSegmentDataFileSetWriterMockRecorder) Reset(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reset", reflect.TypeOf((*MockFSTSegmentDataFileSetWriter)(nil).Reset), arg0) +} + // MockIndexFileSetReader is a mock of IndexFileSetReader interface type MockIndexFileSetReader struct { ctrl *gomock.Controller diff --git a/src/m3ninx/persist/types.go b/src/m3ninx/persist/types.go index 
ec7eef2192..0ec0329c00 100644 --- a/src/m3ninx/persist/types.go +++ b/src/m3ninx/persist/types.go @@ -26,6 +26,7 @@ import ( "regexp" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/x/mmap" ) @@ -50,6 +51,7 @@ type IndexSegmentFileSetWriter interface { MajorVersion() int MinorVersion() int SegmentMetadata() []byte + SegmentState() fst.IndexSegmentState Files() []IndexSegmentFileType WriteFile(fileType IndexSegmentFileType, writer io.Writer) error } @@ -63,6 +65,15 @@ type MutableSegmentFileSetWriter interface { Reset(segment.Builder) error } +// FSTSegmentDataFileSetWriter is a new IndexSegmentFileSetWriter for writing +// out fst.SegmentData. +type FSTSegmentDataFileSetWriter interface { + IndexSegmentFileSetWriter + + // Reset resets the writer to write the provided segment data. + Reset(fst.SegmentData) error +} + // IndexFileSetReader is an index file set reader, it can read many segments. type IndexFileSetReader interface { // SegmentFileSets returns the number of segment file sets. @@ -105,6 +116,10 @@ const ( // DefaultIndexVolumeType is a default IndexVolumeType. // This is the type if not otherwise specified. DefaultIndexVolumeType IndexVolumeType = "default" + // SnapshotColdIndexVolumeType holds cold index snapshot data. + SnapshotColdIndexVolumeType IndexVolumeType = "snapshot_cold" + // SnapshotWarmIndexVolumeType holds warm index snapshot data. + SnapshotWarmIndexVolumeType IndexVolumeType = "snapshot_warm" ) // IndexSegmentType is the type of an index file set. 
diff --git a/src/m3ninx/persist/writer.go b/src/m3ninx/persist/writer.go index 23bab30e0a..854f8f5bdd 100644 --- a/src/m3ninx/persist/writer.go +++ b/src/m3ninx/persist/writer.go @@ -78,6 +78,12 @@ func (w *writer) SegmentMetadata() []byte { return w.fsWriter.Metadata() } +func (w *writer) SegmentState() fst.IndexSegmentState { + // NB(bodu): Flushed index segments are considered frozen + // since they are no longer compactable. + return fst.FrozenIndexSegmentState +} + func (w *writer) Files() []IndexSegmentFileType { // NB(prateek): order is important here. It is the order of files written out, // and needs to be maintained as it is below. @@ -108,20 +114,13 @@ func (w *writer) WriteFile(fileType IndexSegmentFileType, iow io.Writer) error { // NewFSTSegmentDataFileSetWriter creates a new file set writer for // fst segment data. -func NewFSTSegmentDataFileSetWriter( - data fst.SegmentData, -) (IndexSegmentFileSetWriter, error) { - if err := data.Validate(); err != nil { - return nil, err - } - +func NewFSTSegmentDataFileSetWriter() (FSTSegmentDataFileSetWriter, error) { docsWriter, err := fst.NewDocumentsWriter() if err != nil { return nil, err } return &fstSegmentDataWriter{ - data: data, docsWriter: docsWriter, }, nil } @@ -130,12 +129,31 @@ type fstSegmentDataWriter struct { data fst.SegmentData docsWriter *fst.DocumentsWriter docsDataFileWritten bool +} + +// Reset validates the provided segment data and, on success, points the +// writer at it. The writer is left unchanged if validation fails. +func (w *fstSegmentDataWriter) Reset(data fst.SegmentData) error { + if err := data.Validate(); err != nil { + return err + } + w.data = data + // Clear docsDataFileWritten so that a reused writer starts from the same + // state as a freshly constructed one (which leaves the flag false). + w.docsDataFileWritten = false + return nil } func (w *fstSegmentDataWriter) SegmentType() IndexSegmentType { return FSTIndexSegmentType } +// SegmentState returns the state carried by the segment data currently +// set on the writer. +func (w *fstSegmentDataWriter) SegmentState() fst.IndexSegmentState { + return w.data.State +} + func (w *fstSegmentDataWriter) MajorVersion() int { return w.data.Version.Major } diff --git a/src/m3ninx/search/search_mock.go b/src/m3ninx/search/search_mock.go index 3f2e886417..3c084ab3e3 100644 ---
a/src/m3ninx/search/search_mock.go +++ b/src/m3ninx/search/search_mock.go @@ -1,7 +1,7 @@ // Code generated by MockGen. DO NOT EDIT. // Source: github.com/m3db/m3/src/m3ninx/search/types.go -// Copyright (c) 2018 Uber Technologies, Inc. +// Copyright (c) 2020 Uber Technologies, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal