-
Notifications
You must be signed in to change notification settings - Fork 304
feat(partitions): add consensus per partition and extra #3071
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 3 commits
df6f175
3608257
4a886c0
598c549
ed77db5
4759467
b029988
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ use crate::journal::{ | |
| MessageLookup, PartitionJournal, PartitionJournalMemStorage, QueryableJournal, | ||
| }; | ||
| use crate::log::SegmentedLog; | ||
| use crate::offset_storage::{delete_persisted_offset, persist_offset}; | ||
| use crate::{ | ||
| AppendResult, Partition, PartitionOffsets, PollFragments, PollQueryResult, PollingArgs, | ||
| PollingConsumer, | ||
|
|
@@ -30,6 +31,7 @@ use iggy_common::{ | |
| send_messages2::stamp_prepare_for_persistence, | ||
| }; | ||
| use journal::Journal as _; | ||
| use std::collections::HashMap; | ||
| use std::sync::Arc; | ||
| use std::sync::atomic::{AtomicU64, Ordering}; | ||
| use tokio::sync::Mutex as TokioMutex; | ||
|
|
@@ -52,6 +54,57 @@ pub struct IggyPartition { | |
| pub revision_id: u64, | ||
| pub should_increment_offset: bool, | ||
| pub write_lock: Arc<TokioMutex<()>>, | ||
| consumer_offsets_path: Option<String>, | ||
| consumer_group_offsets_path: Option<String>, | ||
| pending_consumer_offset_commits: HashMap<u64, PendingConsumerOffsetCommit>, | ||
| } | ||
|
|
||
/// A consumer-offset mutation staged during the consensus prepare phase.
///
/// Instances are keyed by the consensus operation id in
/// `IggyPartition::pending_consumer_offset_commits` until the matching
/// commit applies them to the in-memory offset maps.
#[derive(Debug, Clone, Copy, PartialEq)]
struct PendingConsumerOffsetCommit {
    // Whether the mutation targets a single consumer or a consumer group.
    kind: ConsumerKind,
    // Consumer (or group) id, already narrowed to u32.
    consumer_id: u32,
    // The mutation to apply on commit (upsert with offset, or delete).
    mutation: PendingConsumerOffsetMutation,
}
|
|
||
/// The kind of staged consumer-offset mutation.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PendingConsumerOffsetMutation {
    // Store the given offset for the consumer (insert or overwrite).
    Upsert(u64),
    // Remove the consumer's stored offset entirely.
    Delete,
}
|
|
||
| impl PendingConsumerOffsetCommit { | ||
| const fn upsert(kind: ConsumerKind, consumer_id: u32, offset: u64) -> Self { | ||
| Self { | ||
| kind, | ||
| consumer_id, | ||
| mutation: PendingConsumerOffsetMutation::Upsert(offset), | ||
| } | ||
| } | ||
|
|
||
| const fn delete(kind: ConsumerKind, consumer_id: u32) -> Self { | ||
| Self { | ||
| kind, | ||
| consumer_id, | ||
| mutation: PendingConsumerOffsetMutation::Delete, | ||
| } | ||
| } | ||
|
|
||
| fn try_from_polling_consumer( | ||
| consumer: PollingConsumer, | ||
| offset: u64, | ||
| ) -> Result<Self, IggyError> { | ||
| let (kind, consumer_id) = match consumer { | ||
| PollingConsumer::Consumer(id, _) => ( | ||
| ConsumerKind::Consumer, | ||
| u32::try_from(id).map_err(|_| IggyError::InvalidCommand)?, | ||
| ), | ||
| PollingConsumer::ConsumerGroup(group_id, _) => ( | ||
| ConsumerKind::ConsumerGroup, | ||
| u32::try_from(group_id).map_err(|_| IggyError::InvalidCommand)?, | ||
| ), | ||
| }; | ||
| Ok(Self::upsert(kind, consumer_id, offset)) | ||
| } | ||
| } | ||
|
|
||
| impl IggyPartition { | ||
|
|
@@ -67,6 +120,158 @@ impl IggyPartition { | |
| revision_id: 0, | ||
| should_increment_offset: false, | ||
| write_lock: Arc::new(TokioMutex::new(())), | ||
| consumer_offsets_path: None, | ||
| consumer_group_offsets_path: None, | ||
| pending_consumer_offset_commits: HashMap::new(), | ||
| } | ||
| } | ||
|
|
||
| pub fn configure_consumer_offset_storage( | ||
| &mut self, | ||
| consumer_offsets_path: String, | ||
| consumer_group_offsets_path: String, | ||
| consumer_offsets: ConsumerOffsets, | ||
| consumer_group_offsets: ConsumerGroupOffsets, | ||
| ) { | ||
| self.consumer_offsets = Arc::new(consumer_offsets); | ||
| self.consumer_group_offsets = Arc::new(consumer_group_offsets); | ||
| self.consumer_offsets_path = Some(consumer_offsets_path); | ||
| self.consumer_group_offsets_path = Some(consumer_group_offsets_path); | ||
| } | ||
|
|
||
| pub(crate) async fn persist_and_stage_consumer_offset_upsert( | ||
| &mut self, | ||
| op: u64, | ||
| kind: ConsumerKind, | ||
| consumer_id: u32, | ||
| offset: u64, | ||
| ) -> Result<(), IggyError> { | ||
| let pending = PendingConsumerOffsetCommit::upsert(kind, consumer_id, offset); | ||
| self.persist_consumer_offset_commit(pending).await?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consumer offsets are persisted to disk here during the prepare phase (before quorum). If the prepare never commits (leader crash, view change), the disk holds uncommitted values with no rollback mechanism; on restart, the uncommitted offset file is loaded as if it were committed. Recommended fix: move the persistence into the commit phase.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the behavior of treating written offset as |
||
| self.pending_consumer_offset_commits.insert(op, pending); | ||
| Ok(()) | ||
| } | ||
|
|
||
| pub(crate) async fn persist_and_stage_consumer_offset_delete( | ||
| &mut self, | ||
| op: u64, | ||
| kind: ConsumerKind, | ||
| consumer_id: u32, | ||
| ) -> Result<(), IggyError> { | ||
| let pending = PendingConsumerOffsetCommit::delete(kind, consumer_id); | ||
| self.persist_consumer_offset_commit(pending).await?; | ||
| self.pending_consumer_offset_commits.insert(op, pending); | ||
| Ok(()) | ||
| } | ||
|
|
||
| pub(crate) fn apply_staged_consumer_offset_commit(&mut self, op: u64) -> Result<(), IggyError> { | ||
| let pending = self | ||
| .pending_consumer_offset_commits | ||
| .remove(&op) | ||
| .ok_or(IggyError::InvalidCommand)?; | ||
| self.apply_consumer_offset_commit(pending); | ||
| Ok(()) | ||
| } | ||
|
|
||
| async fn persist_consumer_offset_commit( | ||
| &self, | ||
| pending: PendingConsumerOffsetCommit, | ||
| ) -> Result<(), IggyError> { | ||
| let Some(path) = self.persisted_offset_path(pending.kind, pending.consumer_id) else { | ||
| return Ok(()); | ||
| }; | ||
| match pending.mutation { | ||
| PendingConsumerOffsetMutation::Upsert(offset) => persist_offset(&path, offset).await, | ||
| PendingConsumerOffsetMutation::Delete => delete_persisted_offset(&path).await, | ||
| } | ||
| } | ||
|
|
||
| fn apply_consumer_offset_commit(&self, pending: PendingConsumerOffsetCommit) { | ||
| match pending.mutation { | ||
| PendingConsumerOffsetMutation::Upsert(offset) | ||
| if pending.kind == ConsumerKind::Consumer => | ||
| { | ||
| let id = pending.consumer_id; | ||
| let guard = self.consumer_offsets.pin(); | ||
| let key = usize::try_from(id).expect("u32 consumer id must fit usize"); | ||
| if let Some(existing) = guard.get(&key) { | ||
| existing.offset.store(offset, Ordering::Relaxed); | ||
| } else { | ||
| let created = self.consumer_offsets_path.as_deref().map_or_else( | ||
| || ConsumerOffset::new(ConsumerKind::Consumer, id, 0, String::new()), | ||
| |path| ConsumerOffset::default_for_consumer(id, path), | ||
| ); | ||
| created.offset.store(offset, Ordering::Relaxed); | ||
| guard.insert(key, created); | ||
| } | ||
| } | ||
| PendingConsumerOffsetMutation::Upsert(offset) | ||
| if pending.kind == ConsumerKind::ConsumerGroup => | ||
| { | ||
| let group_id = pending.consumer_id; | ||
| let guard = self.consumer_group_offsets.pin(); | ||
| let key = ConsumerGroupId( | ||
| usize::try_from(group_id).expect("u32 group id must fit usize"), | ||
| ); | ||
| if let Some(existing) = guard.get(&key) { | ||
| existing.offset.store(offset, Ordering::Relaxed); | ||
| } else { | ||
| let created = self.consumer_group_offsets_path.as_deref().map_or_else( | ||
| || { | ||
| ConsumerOffset::new( | ||
| ConsumerKind::ConsumerGroup, | ||
| group_id, | ||
| 0, | ||
| String::new(), | ||
| ) | ||
| }, | ||
| |path| ConsumerOffset::default_for_consumer_group(key, path), | ||
| ); | ||
| created.offset.store(offset, Ordering::Relaxed); | ||
| guard.insert(key, created); | ||
| } | ||
| } | ||
| PendingConsumerOffsetMutation::Delete if pending.kind == ConsumerKind::Consumer => { | ||
| let id = pending.consumer_id; | ||
| let guard = self.consumer_offsets.pin(); | ||
| let key = usize::try_from(id).expect("u32 consumer id must fit usize"); | ||
| let _ = guard.remove(&key); | ||
| } | ||
| PendingConsumerOffsetMutation::Delete | ||
| if pending.kind == ConsumerKind::ConsumerGroup => | ||
| { | ||
| let group_id = pending.consumer_id; | ||
| let guard = self.consumer_group_offsets.pin(); | ||
| let key = ConsumerGroupId( | ||
| usize::try_from(group_id).expect("u32 group id must fit usize"), | ||
| ); | ||
| let _ = guard.remove(&key); | ||
| } | ||
| _ => {} | ||
numinnex marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| async fn store_consumer_offset_and_persist( | ||
| &self, | ||
| consumer: PollingConsumer, | ||
| offset: u64, | ||
| ) -> Result<(), IggyError> { | ||
| let pending = PendingConsumerOffsetCommit::try_from_polling_consumer(consumer, offset)?; | ||
| self.persist_consumer_offset_commit(pending).await?; | ||
| self.apply_consumer_offset_commit(pending); | ||
| Ok(()) | ||
| } | ||
|
|
||
| fn persisted_offset_path(&self, kind: ConsumerKind, consumer_id: u32) -> Option<String> { | ||
| match kind { | ||
| ConsumerKind::Consumer => self | ||
| .consumer_offsets_path | ||
| .as_ref() | ||
| .map(|path| format!("{path}/{consumer_id}")), | ||
| ConsumerKind::ConsumerGroup => self | ||
| .consumer_group_offsets_path | ||
| .as_ref() | ||
| .map(|path| format!("{path}/{consumer_id}")), | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -191,7 +396,10 @@ impl Partition for IggyPartition { | |
| if args.auto_commit && !fragments.is_empty() { | ||
| let last_offset = | ||
| last_matching_offset.expect("non-empty poll result must have a last offset"); | ||
| if let Err(err) = self.store_consumer_offset(consumer, last_offset) { | ||
| if let Err(err) = self | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
On failover, the new leader has no record of auto-committed offsets, since they were never replicated. This is a pre-existing pattern, but it is now inconsistent: explicit offset commits go through consensus, while poll auto-commits do not.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think for auto commit polls, at the end of the Additionally I think we should expand the |
||
| .store_consumer_offset_and_persist(consumer, last_offset) | ||
| .await | ||
| { | ||
| // warning for now. | ||
| warn!( | ||
| target: "iggy.partitions.diag", | ||
|
|
@@ -212,41 +420,8 @@ impl Partition for IggyPartition { | |
| consumer: PollingConsumer, | ||
| offset: u64, | ||
| ) -> Result<(), IggyError> { | ||
| match consumer { | ||
| PollingConsumer::Consumer(id, _) => { | ||
| let guard = self.consumer_offsets.pin(); | ||
| if let Some(existing) = guard.get(&id) { | ||
| existing.offset.store(offset, Ordering::Relaxed); | ||
| } else { | ||
| guard.insert( | ||
| id, | ||
| ConsumerOffset::new( | ||
| ConsumerKind::Consumer, | ||
| id as u32, | ||
| offset, | ||
| String::new(), | ||
| ), | ||
| ); | ||
| } | ||
| } | ||
| PollingConsumer::ConsumerGroup(group_id, _) => { | ||
| let guard = self.consumer_group_offsets.pin(); | ||
| let key = ConsumerGroupId(group_id); | ||
| if let Some(existing) = guard.get(&key) { | ||
| existing.offset.store(offset, Ordering::Relaxed); | ||
| } else { | ||
| guard.insert( | ||
| key, | ||
| ConsumerOffset::new( | ||
| ConsumerKind::ConsumerGroup, | ||
| group_id as u32, | ||
| offset, | ||
| String::new(), | ||
| ), | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| let pending = PendingConsumerOffsetCommit::try_from_polling_consumer(consumer, offset)?; | ||
| self.apply_consumer_offset_commit(pending); | ||
| Ok(()) | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.