Skip to content

Commit

Permalink
KAFKA-14462; [14/N] Add PartitionWriter (apache#13675)
Browse files Browse the repository at this point in the history
This patch introduces the `PartitionWriter` interface in the `group-coordinator` module. The `ReplicaManager` resides in the `core` module and it is thus not accessible from the `group-coordinator` one. The `CoordinatorPartitionWriter` is basically an implementation of the interface residing in `core` which interfaces with the `ReplicaManager`.

One notable difference from the usual produce path is that the `PartitionWriter` returns the offset following the written records. This is then used by the coordinator runtime to track when the request associated with the write can be completed.

Reviewers: Jeff Kim <[email protected]>, Justine Olshan <[email protected]>
  • Loading branch information
dajac committed Jun 6, 2023
1 parent c8cb852 commit 7d147cf
Show file tree
Hide file tree
Showing 13 changed files with 755 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ public Map<Errors, Integer> errorCounts() {
public static final class PartitionResponse {
public Errors error;
public long baseOffset;
public long lastOffset;
public long logAppendTime;
public long logStartOffset;
public List<RecordError> recordErrors;
Expand All @@ -153,8 +154,21 @@ public PartitionResponse(Errors error, long baseOffset, long logAppendTime, long
}

/**
 * Creates a partition response without an explicit last offset.
 *
 * Delegates to the constructor that also takes {@code lastOffset}, passing
 * {@code INVALID_OFFSET} for it; all other arguments are forwarded unchanged.
 */
public PartitionResponse(Errors error, long baseOffset, long logAppendTime, long logStartOffset, List<RecordError> recordErrors, String errorMessage) {
this(error, baseOffset, INVALID_OFFSET, logAppendTime, logStartOffset, recordErrors, errorMessage);
}

public PartitionResponse(
Errors error,
long baseOffset,
long lastOffset,
long logAppendTime,
long logStartOffset,
List<RecordError> recordErrors,
String errorMessage
) {
this.error = error;
this.baseOffset = baseOffset;
this.lastOffset = lastOffset;
this.logAppendTime = logAppendTime;
this.logStartOffset = logStartOffset;
this.recordErrors = recordErrors;
Expand All @@ -167,6 +181,7 @@ public boolean equals(Object o) {
if (o == null || getClass() != o.getClass()) return false;
PartitionResponse that = (PartitionResponse) o;
return baseOffset == that.baseOffset &&
lastOffset == that.lastOffset &&
logAppendTime == that.logAppendTime &&
logStartOffset == that.logStartOffset &&
error == that.error &&
Expand All @@ -176,7 +191,7 @@ public boolean equals(Object o) {

@Override
public int hashCode() {
return Objects.hash(error, baseOffset, logAppendTime, logStartOffset, recordErrors, errorMessage);
return Objects.hash(error, baseOffset, lastOffset, logAppendTime, logStartOffset, recordErrors, errorMessage);
}

@Override
Expand All @@ -187,6 +202,8 @@ public String toString() {
b.append(error);
b.append(",offset: ");
b.append(baseOffset);
b.append(",lastOffset: ");
b.append(lastOffset);
b.append(",logAppendTime: ");
b.append(logAppendTime);
b.append(", logStartOffset: ");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.coordinator.group

import kafka.cluster.PartitionListener
import kafka.server.{ActionQueue, ReplicaManager}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.errors.RecordTooLargeException
import org.apache.kafka.common.protocol.Errors
import org.apache.kafka.common.record.{CompressionType, MemoryRecords, TimestampType}
import org.apache.kafka.common.record.Record.EMPTY_HEADERS
import org.apache.kafka.common.requests.ProduceResponse.PartitionResponse
import org.apache.kafka.common.utils.Time
import org.apache.kafka.coordinator.group.runtime.PartitionWriter
import org.apache.kafka.storage.internals.log.AppendOrigin

import java.nio.ByteBuffer
import java.util
import scala.collection.Map

/**
 * Wraps a PartitionWriter.Listener so that it can be used wherever a
 * PartitionListener is expected. High watermark updates are forwarded
 * to the wrapped listener as-is.
 *
 * Equality and hashing are delegated to the wrapped listener, so two
 * adapters wrapping equal listeners are themselves equal.
 */
private[group] class ListenerAdapter(
  val listener: PartitionWriter.Listener
) extends PartitionListener {

  // Forward the notification to the wrapped listener without any translation.
  override def onHighWatermarkUpdated(tp: TopicPartition, offset: Long): Unit =
    listener.onHighWatermarkUpdated(tp, offset)

  // Only another ListenerAdapter can be equal; the comparison is made on the
  // wrapped listeners.
  override def equals(that: Any): Boolean = that match {
    case adapter: ListenerAdapter => listener.equals(adapter.listener)
    case _ => false
  }

  // Kept consistent with equals by hashing the wrapped listener.
  override def hashCode(): Int = listener.hashCode()

  override def toString: String = s"ListenerAdapter(listener=$listener)"
}

/**
 * A PartitionWriter which appends records to a partition through the
 * ReplicaManager and which bridges PartitionWriter.Listener registrations
 * to the ReplicaManager via ListenerAdapter.
 *
 * @param replicaManager  The replica manager used to look up log configs,
 *                        manage listeners and append records.
 * @param serializer      The serializer turning a record into key and value bytes.
 * @param compressionType The compression type used when building the batch.
 * @param time            The time source used to timestamp the records.
 */
class CoordinatorPartitionWriter[T](
  replicaManager: ReplicaManager,
  serializer: PartitionWriter.Serializer[T],
  compressionType: CompressionType,
  time: Time
) extends PartitionWriter[T] {
  // This action queue runs every action immediately instead of deferring it.
  // This is possible here because we don't hold any conflicting locks.
  private val directActionQueue = new ActionQueue {
    override def add(action: () => Unit): Unit = action()

    override def tryCompleteActions(): Unit = ()
  }

  /**
   * Register a PartitionWriter.Listener.
   *
   * @param tp       The partition to register the listener to.
   * @param listener The listener.
   */
  override def registerListener(
    tp: TopicPartition,
    listener: PartitionWriter.Listener
  ): Unit = {
    val adapter = new ListenerAdapter(listener)
    replicaManager.maybeAddListener(tp, adapter)
  }

  /**
   * Deregister a PartitionWriter.Listener.
   *
   * A fresh adapter is built around the listener and handed to the replica
   * manager for removal.
   *
   * @param tp       The partition to deregister the listener from.
   * @param listener The listener.
   */
  override def deregisterListener(
    tp: TopicPartition,
    listener: PartitionWriter.Listener
  ): Unit = {
    val adapter = new ListenerAdapter(listener)
    replicaManager.removeListener(tp, adapter)
  }

  /**
   * Write records to the partition. All the records are put in one single batch.
   *
   * @param tp      The partition to write records to.
   * @param records The list of records. The records are written in a single batch.
   * @return The offset following the last written record, i.e. last offset + 1.
   * @throws IllegalStateException   If the list of records is empty or if the append
   *                                 result does not contain the partition.
   * @throws RecordTooLargeException If the records do not all fit in one batch of the
   *                                 maximum message size configured for the log.
   * @throws KafkaException          The exception of the error reported for the partition,
   *                                 or NOT_LEADER_OR_FOLLOWER if there is no log config
   *                                 for the partition.
   */
  override def append(
    tp: TopicPartition,
    records: util.List[T]
  ): Long = {
    if (records.isEmpty) {
      throw new IllegalStateException("records must be non-empty.")
    }

    // Without a log config for the partition, the write cannot proceed.
    val logConfig = replicaManager.getLogConfig(tp).getOrElse {
      throw Errors.NOT_LEADER_OR_FOLLOWER.exception()
    }

    val recordMagic = logConfig.recordVersion.value
    val batchSizeLimit = logConfig.maxMessageSize
    val nowMs = time.milliseconds()

    // The buffer starts at 16384 bytes at most; the builder itself is
    // limited to the maximum message size of the log.
    val initialBuffer = ByteBuffer.allocate(math.min(16384, batchSizeLimit))
    val builder = MemoryRecords.builder(
      initialBuffer,
      recordMagic,
      compressionType,
      TimestampType.CREATE_TIME,
      0L,
      batchSizeLimit
    )

    records.forEach { record =>
      val key = serializer.serializeKey(record)
      val value = serializer.serializeValue(record)

      // Fail the whole write as soon as one record does not fit in the batch.
      if (!builder.hasRoomFor(nowMs, key, value, EMPTY_HEADERS)) {
        throw new RecordTooLargeException(s"Message batch size is ${builder.estimatedSizeInBytes()} bytes " +
          s"in append to partition $tp which exceeds the maximum configured size of $batchSizeLimit.")
      }

      builder.append(nowMs, key, value, EMPTY_HEADERS)
    }

    // Populated by the response callback passed to appendRecords below.
    var appendResults: Map[TopicPartition, PartitionResponse] = Map.empty
    replicaManager.appendRecords(
      timeout = 0L,
      requiredAcks = 1,
      internalTopicsAllowed = true,
      origin = AppendOrigin.COORDINATOR,
      entriesPerPartition = Map(tp -> builder.build()),
      responseCallback = responses => appendResults = responses,
      // We can directly complete the purgatories here because we don't hold
      // any conflicting locks.
      actionQueue = directActionQueue
    )

    val response = appendResults.getOrElse(tp,
      throw new IllegalStateException(s"Append status $appendResults should have partition $tp."))

    if (response.error == Errors.NONE) {
      // Required offset.
      response.lastOffset + 1
    } else {
      throw response.error.exception()
    }
  }
}
22 changes: 16 additions & 6 deletions core/src/main/scala/kafka/server/ActionQueue.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,32 @@ import java.util.concurrent.ConcurrentLinkedQueue
import kafka.utils.Logging

/**
* This queue is used to collect actions which need to be executed later. One use case is that ReplicaManager#appendRecords
* produces record changes so we need to check and complete delayed requests. In order to avoid conflicting locking,
* we add those actions to this queue and then complete them at the end of KafkaApis.handle() or DelayedJoin.onExpiration.
* The action queue is used to collect actions which need to be executed later.
*/
class ActionQueue extends Logging {
private val queue = new ConcurrentLinkedQueue[() => Unit]()
trait ActionQueue {

/**
* add action to this queue.
* @param action action
*/
def add(action: () => Unit): Unit = queue.add(action)
def add(action: () => Unit): Unit

/**
* try to complete all delayed actions
*/
def tryCompleteActions(): Unit
}

/**
* This queue is used to collect actions which need to be executed later. One use case is that ReplicaManager#appendRecords
* produces record changes so we need to check and complete delayed requests. In order to avoid conflicting locking,
* we add those actions to this queue and then complete them at the end of KafkaApis.handle() or DelayedJoin.onExpiration.
*/
class DelayedActionQueue extends Logging with ActionQueue {
private val queue = new ConcurrentLinkedQueue[() => Unit]()

def add(action: () => Unit): Unit = queue.add(action)

def tryCompleteActions(): Unit = {
val maxToComplete = queue.size()
var count = 0
Expand Down
38 changes: 19 additions & 19 deletions core/src/main/scala/kafka/server/ReplicaManager.scala
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ class ReplicaManager(val config: KafkaConfig,
/**
* TODO: move this action queue to handle thread so we can simplify concurrency handling
*/
private val actionQueue = new ActionQueue
private val actionQueue = new DelayedActionQueue

def tryCompleteActions(): Unit = actionQueue.tryCompleteActions()

Expand All @@ -655,6 +655,7 @@ class ReplicaManager(val config: KafkaConfig,
* @param requestLocal container for the stateful instances scoped to this request
* @param transactionalId transactional ID if the request is from a producer and the producer is transactional
* @param transactionStatePartition partition that holds the transactional state if transactionalId is present
* @param actionQueue the action queue to use. ReplicaManager#actionQueue is used by default.
*/
def appendRecords(timeout: Long,
requiredAcks: Short,
Expand All @@ -666,7 +667,8 @@ class ReplicaManager(val config: KafkaConfig,
recordConversionStatsCallback: Map[TopicPartition, RecordConversionStats] => Unit = _ => (),
requestLocal: RequestLocal = RequestLocal.NoCaching,
transactionalId: String = null,
transactionStatePartition: Option[Int] = None): Unit = {
transactionStatePartition: Option[Int] = None,
actionQueue: ActionQueue = this.actionQueue): Unit = {
if (isValidRequiredAcks(requiredAcks)) {
val sTime = time.milliseconds

Expand Down Expand Up @@ -722,6 +724,7 @@ class ReplicaManager(val config: KafkaConfig,
new PartitionResponse(
result.error,
result.info.firstOffset.map[Long](_.messageOffset).orElse(-1L),
result.info.lastOffset,
result.info.logAppendTime,
result.info.logStartOffset,
result.info.recordErrors,
Expand All @@ -731,23 +734,21 @@ class ReplicaManager(val config: KafkaConfig,
}

actionQueue.add {
() =>
allResults.foreach {
case (topicPartition, result) =>
val requestKey = TopicPartitionOperationKey(topicPartition)
result.info.leaderHwChange match {
case LeaderHwChange.INCREASED =>
// some delayed operations may be unblocked after HW changed
delayedProducePurgatory.checkAndComplete(requestKey)
delayedFetchPurgatory.checkAndComplete(requestKey)
delayedDeleteRecordsPurgatory.checkAndComplete(requestKey)
case LeaderHwChange.SAME =>
// probably unblock some follower fetch requests since log end offset has been updated
delayedFetchPurgatory.checkAndComplete(requestKey)
case LeaderHwChange.NONE =>
// nothing
}
() => allResults.foreach { case (topicPartition, result) =>
val requestKey = TopicPartitionOperationKey(topicPartition)
result.info.leaderHwChange match {
case LeaderHwChange.INCREASED =>
// some delayed operations may be unblocked after HW changed
delayedProducePurgatory.checkAndComplete(requestKey)
delayedFetchPurgatory.checkAndComplete(requestKey)
delayedDeleteRecordsPurgatory.checkAndComplete(requestKey)
case LeaderHwChange.SAME =>
// probably unblock some follower fetch requests since log end offset has been updated
delayedFetchPurgatory.checkAndComplete(requestKey)
case LeaderHwChange.NONE =>
// nothing
}
}
}

recordConversionStatsCallback(localProduceResults.map { case (k, v) => k -> v.info.recordConversionStats })
Expand All @@ -764,7 +765,6 @@ class ReplicaManager(val config: KafkaConfig,
// this is because while the delayed produce operation is being created, new
// requests may arrive and hence make this operation completable.
delayedProducePurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys)

} else {
// we can respond immediately
val produceResponseStatus = produceStatus.map { case (k, status) => k -> status.responseStatus }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ object AbstractCoordinatorConcurrencyTest {
processingStatsCallback: Map[TopicPartition, RecordConversionStats] => Unit = _ => (),
requestLocal: RequestLocal = RequestLocal.NoCaching,
transactionalId: String = null,
transactionStatePartition: Option[Int]): Unit = {
transactionStatePartition: Option[Int],
actionQueue: ActionQueue = null): Unit = {

if (entriesPerPartition.isEmpty)
return
Expand Down
Loading

0 comments on commit 7d147cf

Please sign in to comment.