2022-04-19 11:21:11 -04:00
|
|
|
#include <haproxy/quic_stream.h>
|
|
|
|
|
|
|
|
|
|
#include <import/eb64tree.h>
|
|
|
|
|
|
|
|
|
|
#include <haproxy/api.h>
|
|
|
|
|
#include <haproxy/buf.h>
|
|
|
|
|
#include <haproxy/dynbuf.h>
|
2024-08-13 03:34:28 -04:00
|
|
|
#include <haproxy/errors.h>
|
2024-01-17 09:15:55 -05:00
|
|
|
#include <haproxy/mux_quic.h>
|
2022-04-19 11:21:11 -04:00
|
|
|
#include <haproxy/pool.h>
|
2022-09-30 12:11:13 -04:00
|
|
|
#include <haproxy/quic_conn.h>
|
2025-05-07 11:32:46 -04:00
|
|
|
#include <haproxy/quic_utils.h>
|
2022-09-30 11:44:15 -04:00
|
|
|
#include <haproxy/task.h>
|
2022-04-19 11:21:11 -04:00
|
|
|
|
MEDIUM: tree-wide: replace most DECLARE_POOL with DECLARE_TYPED_POOL
This will make the pools size and alignment automatically inherit
the type declaration. It was done like this:
sed -i -e 's:DECLARE_POOL(\([^,]*,[^,]*,\s*\)sizeof(\([^)]*\))):DECLARE_TYPED_POOL(\1\2):g' $(git grep -lw DECLARE_POOL src addons)
sed -i -e 's:DECLARE_STATIC_POOL(\([^,]*,[^,]*,\s*\)sizeof(\([^)]*\))):DECLARE_STATIC_TYPED_POOL(\1\2):g' $(git grep -lw DECLARE_STATIC_POOL src addons)
81 replacements were made. The only remaining ones are those which set
their own size without depending on a structure. The few ones with an
extra size were manually handled.
It also means that the requested alignments are now checked against the
type's. Given that none is specified for now, no issue is reported.
It was verified with "show pools detailed" that the definitions are
exactly the same, and that the binaries are similar.
2025-08-06 10:43:27 -04:00
|
|
|
/* File-local pool "qc_stream_desc", sized from struct qc_stream_desc. */
DECLARE_STATIC_TYPED_POOL(pool_head_quic_stream_desc, "qc_stream_desc", struct qc_stream_desc);
|
|
|
|
|
/* File-local pool "qc_stream_buf", sized from struct qc_stream_buf. */
DECLARE_STATIC_TYPED_POOL(pool_head_quic_stream_buf, "qc_stream_buf", struct qc_stream_buf);
|
|
|
|
|
/* File-local pool "qc_stream_ack", sized from struct qc_stream_ack. */
DECLARE_STATIC_TYPED_POOL(pool_head_quic_stream_ack, "qc_stream_ack", struct qc_stream_ack);
|
2022-04-15 11:29:25 -04:00
|
|
|
|
2024-01-26 08:30:16 -05:00
|
|
|
static void qc_stream_buf_free(struct qc_stream_desc *stream,
|
|
|
|
|
struct qc_stream_buf **stream_buf)
|
|
|
|
|
{
|
|
|
|
|
struct buffer *buf = &(*stream_buf)->buf;
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
uint64_t room;
|
2024-01-26 08:30:16 -05:00
|
|
|
|
2024-10-01 05:13:41 -04:00
|
|
|
/* Caller is responsible to remove buffered ACK frames before destroying a buffer instance. */
|
2024-10-01 11:34:55 -04:00
|
|
|
BUG_ON(!eb_is_empty(&(*stream_buf)->ack_tree));
|
2024-10-01 05:13:41 -04:00
|
|
|
|
2024-10-01 05:27:37 -04:00
|
|
|
eb64_delete(&(*stream_buf)->offset_node);
|
2024-01-29 03:18:08 -05:00
|
|
|
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
if (*stream_buf == stream->buf) {
|
|
|
|
|
/* Reset current buffer ptr. */
|
2024-01-26 08:30:16 -05:00
|
|
|
stream->buf = NULL;
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
room = b_size(buf);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* For released buffer, acked data were already notified. */
|
|
|
|
|
room = b_data(buf);
|
|
|
|
|
}
|
2024-01-26 08:30:16 -05:00
|
|
|
|
2026-03-09 02:38:22 -04:00
|
|
|
b_free(buf);
|
|
|
|
|
if (!(*stream_buf)->sbuf) {
|
2025-05-07 11:32:46 -04:00
|
|
|
bdata_ctr_del(&stream->data, b_data(buf));
|
|
|
|
|
bdata_ctr_bdec(&stream->data);
|
2024-06-13 09:26:51 -04:00
|
|
|
offer_buffers(NULL, 1);
|
|
|
|
|
}
|
2024-01-26 08:30:16 -05:00
|
|
|
pool_free(pool_head_quic_stream_buf, *stream_buf);
|
|
|
|
|
*stream_buf = NULL;
|
|
|
|
|
|
|
|
|
|
/* notify MUX about available buffers. */
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
if (stream->notify_room && room)
|
|
|
|
|
stream->notify_room(stream, room);
|
2024-01-26 08:30:16 -05:00
|
|
|
}
|
|
|
|
|
|
2022-04-19 11:21:11 -04:00
|
|
|
/* Allocate a new stream descriptor with id <id>. The caller is responsible to
|
2022-09-09 12:05:45 -04:00
|
|
|
* store the stream in the appropriate tree. -1 special value must be used for
|
|
|
|
|
* a CRYPTO data stream, the type being ignored.
|
2022-04-19 11:21:11 -04:00
|
|
|
*
|
|
|
|
|
* Returns the newly allocated instance on success or else NULL.
|
|
|
|
|
*/
|
2022-05-02 12:46:58 -04:00
|
|
|
struct qc_stream_desc *qc_stream_desc_new(uint64_t id, enum qcs_type type, void *ctx,
|
2022-04-19 11:59:50 -04:00
|
|
|
struct quic_conn *qc)
|
2022-04-19 11:21:11 -04:00
|
|
|
{
|
|
|
|
|
struct qc_stream_desc *stream;
|
|
|
|
|
|
2022-05-27 03:11:02 -04:00
|
|
|
stream = pool_alloc(pool_head_quic_stream_desc);
|
2022-04-19 11:21:11 -04:00
|
|
|
if (!stream)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
2022-09-09 12:05:45 -04:00
|
|
|
if (id == (uint64_t)-1) {
|
|
|
|
|
stream->by_id.key = (uint64_t)-1;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
stream->by_id.key = id;
|
|
|
|
|
eb64_insert(&qc->streams_by_id, &stream->by_id);
|
|
|
|
|
}
|
2022-04-21 05:00:41 -04:00
|
|
|
stream->qc = qc;
|
2022-04-19 11:21:11 -04:00
|
|
|
|
2022-04-15 11:29:25 -04:00
|
|
|
stream->buf = NULL;
|
2024-10-01 05:27:37 -04:00
|
|
|
stream->buf_tree = EB_ROOT_UNIQUE;
|
2022-04-15 11:29:25 -04:00
|
|
|
stream->buf_offset = 0;
|
2025-05-07 11:32:46 -04:00
|
|
|
bdata_ctr_init(&stream->data);
|
2022-04-15 11:29:25 -04:00
|
|
|
|
2025-05-09 11:30:27 -04:00
|
|
|
stream->origin_ts = now_ns;
|
2022-04-19 11:21:11 -04:00
|
|
|
stream->ack_offset = 0;
|
2024-08-05 12:52:27 -04:00
|
|
|
stream->flags = 0;
|
2022-04-19 11:21:11 -04:00
|
|
|
stream->ctx = ctx;
|
2024-09-25 11:55:10 -04:00
|
|
|
stream->notify_send = NULL;
|
2024-09-25 12:25:08 -04:00
|
|
|
stream->notify_room = NULL;
|
2022-04-19 11:21:11 -04:00
|
|
|
|
|
|
|
|
return stream;
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-19 11:59:50 -04:00
|
|
|
/* Mark the stream descriptor <stream> as released. It will be freed as soon as
|
2024-09-30 08:39:15 -04:00
|
|
|
* all its buffered data are acknowledged.
|
BUG/MEDIUM: quic: remove unsent data from qc_stream_desc buf
QCS instances use qc_stream_desc for data buffering on emission. On
stream reset, its Tx channel is closed earlier than expected. This may
leave unsent data into qc_stream_desc.
Before this patch, these unsent data would remain after QCS freeing.
This prevents the buffer to be released as no ACK reception will remove
them. The buffer is only freed when the whole connection is closed. As
qc_stream_desc buffer is limited per connection, this reduces the buffer
pool for other streams of the same connection. In the worst case if
several streams are resetted, this may completely freeze the transfer of
the remaining connection streams.
This bug was reproduced by reducing the connection buffer pool to a
single buffer instance by using the following global statement :
tune.quic.frontend.conn-tx-buffers.limit 1.
Then a QUIC client is used which opens a stream for a large enough
object to ensure data are buffered. The client them emits a STOP_SENDING
before reading all data, which forces the corresponding QCS instance to
be resetted. The client then opens a new request but the transfer is
freezed due to this bug.
To fix this, adjust qc_stream_desc API. Add a new argument <final_size>
on qc_stream_desc_release() function. Its value is compared to the
currently buffered offset in latest qc_stream_desc buffer. If
<final_size> is inferior, it means unsent data are present in the
buffer. As such, qc_stream_desc_release() removes them to ensure the
buffer will finally be freed when all ACKs are received. It is also
possible that no data remains immediately, indicating that ACK were
already received. As such, buffer instance is immediately removed by
qc_stream_buf_free().
This must be backported up to 2.6. As this code section is known to
regression, a period of observation could be reserved before
distributing it on LTS releases.
2024-01-26 08:41:04 -05:00
|
|
|
*
|
|
|
|
|
* <final_size> corresponds to the last offset sent for this stream. If there
|
|
|
|
|
* is unsent data present, they will be remove first to guarantee that buffer
|
|
|
|
|
* is freed after receiving all acknowledges.
|
2024-09-25 12:25:08 -04:00
|
|
|
*
|
|
|
|
|
* It is expected that upper layer instance related to <stream> may disappear
|
|
|
|
|
* after this operation. As such, <new_ctx> must be set to reassociate <stream>
|
|
|
|
|
* for notifications.
|
2022-04-19 11:21:11 -04:00
|
|
|
*/
|
BUG/MEDIUM: quic: remove unsent data from qc_stream_desc buf
QCS instances use qc_stream_desc for data buffering on emission. On
stream reset, its Tx channel is closed earlier than expected. This may
leave unsent data into qc_stream_desc.
Before this patch, these unsent data would remain after QCS freeing.
This prevents the buffer to be released as no ACK reception will remove
them. The buffer is only freed when the whole connection is closed. As
qc_stream_desc buffer is limited per connection, this reduces the buffer
pool for other streams of the same connection. In the worst case if
several streams are resetted, this may completely freeze the transfer of
the remaining connection streams.
This bug was reproduced by reducing the connection buffer pool to a
single buffer instance by using the following global statement :
tune.quic.frontend.conn-tx-buffers.limit 1.
Then a QUIC client is used which opens a stream for a large enough
object to ensure data are buffered. The client them emits a STOP_SENDING
before reading all data, which forces the corresponding QCS instance to
be resetted. The client then opens a new request but the transfer is
freezed due to this bug.
To fix this, adjust qc_stream_desc API. Add a new argument <final_size>
on qc_stream_desc_release() function. Its value is compared to the
currently buffered offset in latest qc_stream_desc buffer. If
<final_size> is inferior, it means unsent data are present in the
buffer. As such, qc_stream_desc_release() removes them to ensure the
buffer will finally be freed when all ACKs are received. It is also
possible that no data remains immediately, indicating that ACK were
already received. As such, buffer instance is immediately removed by
qc_stream_buf_free().
This must be backported up to 2.6. As this code section is known to
regression, a period of observation could be reserved before
distributing it on LTS releases.
2024-01-26 08:41:04 -05:00
|
|
|
void qc_stream_desc_release(struct qc_stream_desc *stream,
|
2024-09-25 12:25:08 -04:00
|
|
|
uint64_t final_size, void *new_ctx)
|
2022-04-19 11:21:11 -04:00
|
|
|
{
|
2022-04-19 11:59:50 -04:00
|
|
|
/* A stream can be released only one time. */
|
2024-08-05 12:52:27 -04:00
|
|
|
BUG_ON(stream->flags & QC_SD_FL_RELEASE);
|
2022-04-19 11:21:11 -04:00
|
|
|
|
2024-08-05 12:52:27 -04:00
|
|
|
stream->flags |= QC_SD_FL_RELEASE;
|
2024-09-25 12:25:08 -04:00
|
|
|
stream->ctx = new_ctx;
|
2022-04-19 11:21:11 -04:00
|
|
|
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
/* Release active buffer if still present on streamdesc release. */
|
BUG/MEDIUM: quic: remove unsent data from qc_stream_desc buf
QCS instances use qc_stream_desc for data buffering on emission. On
stream reset, its Tx channel is closed earlier than expected. This may
leave unsent data into qc_stream_desc.
Before this patch, these unsent data would remain after QCS freeing.
This prevents the buffer to be released as no ACK reception will remove
them. The buffer is only freed when the whole connection is closed. As
qc_stream_desc buffer is limited per connection, this reduces the buffer
pool for other streams of the same connection. In the worst case if
several streams are resetted, this may completely freeze the transfer of
the remaining connection streams.
This bug was reproduced by reducing the connection buffer pool to a
single buffer instance by using the following global statement :
tune.quic.frontend.conn-tx-buffers.limit 1.
Then a QUIC client is used which opens a stream for a large enough
object to ensure data are buffered. The client them emits a STOP_SENDING
before reading all data, which forces the corresponding QCS instance to
be resetted. The client then opens a new request but the transfer is
freezed due to this bug.
To fix this, adjust qc_stream_desc API. Add a new argument <final_size>
on qc_stream_desc_release() function. Its value is compared to the
currently buffered offset in latest qc_stream_desc buffer. If
<final_size> is inferior, it means unsent data are present in the
buffer. As such, qc_stream_desc_release() removes them to ensure the
buffer will finally be freed when all ACKs are received. It is also
possible that no data remains immediately, indicating that ACK were
already received. As such, buffer instance is immediately removed by
qc_stream_buf_free().
This must be backported up to 2.6. As this code section is known to
regression, a period of observation could be reserved before
distributing it on LTS releases.
2024-01-26 08:41:04 -05:00
|
|
|
if (stream->buf) {
|
|
|
|
|
struct qc_stream_buf *stream_buf = stream->buf;
|
|
|
|
|
struct buffer *buf = &stream_buf->buf;
|
|
|
|
|
const uint64_t tail_offset =
|
|
|
|
|
MAX(stream->buf_offset, stream->ack_offset) + b_data(buf);
|
|
|
|
|
|
|
|
|
|
/* final_size cannot be greater than all currently stored data. */
|
|
|
|
|
BUG_ON(final_size > tail_offset);
|
|
|
|
|
|
|
|
|
|
/* Remove unsent data from current buffer. */
|
2024-08-07 12:01:51 -04:00
|
|
|
if (final_size < tail_offset)
|
BUG/MEDIUM: quic: remove unsent data from qc_stream_desc buf
QCS instances use qc_stream_desc for data buffering on emission. On
stream reset, its Tx channel is closed earlier than expected. This may
leave unsent data into qc_stream_desc.
Before this patch, these unsent data would remain after QCS freeing.
This prevents the buffer to be released as no ACK reception will remove
them. The buffer is only freed when the whole connection is closed. As
qc_stream_desc buffer is limited per connection, this reduces the buffer
pool for other streams of the same connection. In the worst case if
several streams are resetted, this may completely freeze the transfer of
the remaining connection streams.
This bug was reproduced by reducing the connection buffer pool to a
single buffer instance by using the following global statement :
tune.quic.frontend.conn-tx-buffers.limit 1.
Then a QUIC client is used which opens a stream for a large enough
object to ensure data are buffered. The client them emits a STOP_SENDING
before reading all data, which forces the corresponding QCS instance to
be resetted. The client then opens a new request but the transfer is
freezed due to this bug.
To fix this, adjust qc_stream_desc API. Add a new argument <final_size>
on qc_stream_desc_release() function. Its value is compared to the
currently buffered offset in latest qc_stream_desc buffer. If
<final_size> is inferior, it means unsent data are present in the
buffer. As such, qc_stream_desc_release() removes them to ensure the
buffer will finally be freed when all ACKs are received. It is also
possible that no data remains immediately, indicating that ACK were
already received. As such, buffer instance is immediately removed by
qc_stream_buf_free().
This must be backported up to 2.6. As this code section is known to
regression, a period of observation could be reserved before
distributing it on LTS releases.
2024-01-26 08:41:04 -05:00
|
|
|
b_sub(buf, tail_offset - final_size);
|
2024-08-07 12:01:51 -04:00
|
|
|
|
2025-04-01 16:44:54 -04:00
|
|
|
/* Release active buffer, or delete it immediately if there is
|
2024-10-09 05:59:32 -04:00
|
|
|
* no data to acknowledge. Both functions will reset active
|
|
|
|
|
* buf pointer and invoke <notify_room> if necessary.
|
|
|
|
|
*/
|
|
|
|
|
if (!b_data(buf))
|
2024-08-07 12:01:51 -04:00
|
|
|
qc_stream_buf_free(stream, &stream_buf);
|
2024-10-09 05:59:32 -04:00
|
|
|
else
|
|
|
|
|
qc_stream_buf_release(stream);
|
BUG/MEDIUM: quic: remove unsent data from qc_stream_desc buf
QCS instances use qc_stream_desc for data buffering on emission. On
stream reset, its Tx channel is closed earlier than expected. This may
leave unsent data into qc_stream_desc.
Before this patch, these unsent data would remain after QCS freeing.
This prevents the buffer to be released as no ACK reception will remove
them. The buffer is only freed when the whole connection is closed. As
qc_stream_desc buffer is limited per connection, this reduces the buffer
pool for other streams of the same connection. In the worst case if
several streams are resetted, this may completely freeze the transfer of
the remaining connection streams.
This bug was reproduced by reducing the connection buffer pool to a
single buffer instance by using the following global statement :
tune.quic.frontend.conn-tx-buffers.limit 1.
Then a QUIC client is used which opens a stream for a large enough
object to ensure data are buffered. The client them emits a STOP_SENDING
before reading all data, which forces the corresponding QCS instance to
be resetted. The client then opens a new request but the transfer is
freezed due to this bug.
To fix this, adjust qc_stream_desc API. Add a new argument <final_size>
on qc_stream_desc_release() function. Its value is compared to the
currently buffered offset in latest qc_stream_desc buffer. If
<final_size> is inferior, it means unsent data are present in the
buffer. As such, qc_stream_desc_release() removes them to ensure the
buffer will finally be freed when all ACKs are received. It is also
possible that no data remains immediately, indicating that ACK were
already received. As such, buffer instance is immediately removed by
qc_stream_buf_free().
This must be backported up to 2.6. As this code section is known to
regression, a period of observation could be reserved before
distributing it on LTS releases.
2024-01-26 08:41:04 -05:00
|
|
|
}
|
|
|
|
|
|
BUG/MEDIUM: quic: handle retransmit for standalone FIN STREAM
STREAM frames have dedicated handling on retransmission. A special check
is done to remove data already acked in case of duplicated frames, thus
only unacked data are retransmitted.
This handling is faulty in case of an empty STREAM frame with FIN set.
On retransmission, this frame does not cover any unacked range as it is
empty and is thus discarded. This may cause the transfer to freeze with
the client waiting indefinitely for the FIN notification.
To handle retransmission of empty FIN STREAM frame, qc_stream_desc layer
have been extended. A new flag QC_SD_FL_WAIT_FOR_FIN is set by MUX QUIC
when FIN has been transmitted. If set, it prevents qc_stream_desc to be
freed until FIN is acknowledged. On retransmission side,
qc_stream_frm_is_acked() has been updated. It now reports false if
FIN bit is set on the frame and qc_stream_desc has QC_SD_FL_WAIT_FOR_FIN
set.
This must be backported up to 2.6. However, this modifies heavily
critical section for ACK handling and retransmission. As such, it must
be backported only after a period of observation.
This issue can be reproduced by using the following socat command as
server to add delay between the response and connection closure :
$ socat TCP-LISTEN:<port>,fork,reuseaddr,crlf SYSTEM:'echo "HTTP/1.1 200 OK"; echo ""; sleep 1;'
On the client side, ngtcp2 can be used to simulate packet drop. Without
this patch, connection will be interrupted on QUIC idle timeout or
haproxy client timeout with ERR_DRAINING on ngtcp2 :
$ ngtcp2-client --exit-on-all-streams-close -r 0.3 <host> <port> "http://<host>:<port>/?s=32o"
Alternatively to ngtcp2 random loss, an extra haproxy patch can also be
used to force skipping the emission of the empty STREAM frame :
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
index efbdfe687..1ff899acd 100644
--- a/include/haproxy/quic_tx-t.h
+++ b/include/haproxy/quic_tx-t.h
@@ -26,6 +26,8 @@ extern struct pool_head *pool_head_quic_cc_buf;
/* Flag a sent packet as being probing with old data */
#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+#define QUIC_FL_TX_PACKET_SKIP_SENDTO (1UL << 6)
+
/* Structure to store enough information about TX QUIC packets. */
struct quic_tx_packet {
/* List entry point. */
diff --git a/src/quic_tx.c b/src/quic_tx.c
index 2f199ac3c..2702fc9b9 100644
--- a/src/quic_tx.c
+++ b/src/quic_tx.c
@@ -318,7 +318,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
tmpbuf.size = tmpbuf.data = dglen;
TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc);
- if (!skip_sendto) {
+ if (!skip_sendto && !(first_pkt->flags & QUIC_FL_TX_PACKET_SKIP_SENDTO)) {
int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0, gso);
if (ret < 0) {
if (gso && ret == -EIO) {
@@ -354,6 +354,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
qc->cntrs.sent_bytes_gso += ret;
}
}
+ first_pkt->flags &= ~QUIC_FL_TX_PACKET_SKIP_SENDTO;
b_del(buf, dglen + QUIC_DGRAM_HEADLEN);
qc->bytes.tx += tmpbuf.data;
@@ -2066,6 +2067,17 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
continue;
}
+ switch (cf->type) {
+ case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+ if (!cf->stream.len && (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT)) {
+ TRACE_USER("artificially drop packet with empty STREAM frame", QUIC_EV_CONN_TXPKT, qc);
+ pkt->flags |= QUIC_FL_TX_PACKET_SKIP_SENDTO;
+ }
+ break;
+ default:
+ break;
+ }
+
quic_tx_packet_refinc(pkt);
cf->pkt = pkt;
}
2024-08-05 12:58:49 -04:00
|
|
|
if (qc_stream_desc_done(stream)) {
|
2022-04-15 11:29:25 -04:00
|
|
|
/* if no buffer left we can free the stream. */
|
2022-08-20 12:59:36 -04:00
|
|
|
qc_stream_desc_free(stream, 0);
|
2022-04-15 11:29:25 -04:00
|
|
|
}
|
2022-04-19 11:21:11 -04:00
|
|
|
}
|
|
|
|
|
|
2024-10-02 08:44:41 -04:00
|
|
|
static int qc_stream_buf_is_released(const struct qc_stream_buf *buf,
|
|
|
|
|
const struct qc_stream_desc *stream)
|
|
|
|
|
{
|
|
|
|
|
return buf != stream->buf;
|
|
|
|
|
}
|
|
|
|
|
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restored with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, even out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same or a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exist anymore, ACK data
ranges tree instantiation can use EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reordering is high.
2024-10-02 04:23:21 -04:00
|
|
|
/* Store an out-of-order stream ACK for <buf>. This corresponds to a frame
|
|
|
|
|
* starting at <offset> of length <len> with <fin> set if FIN is present.
|
|
|
|
|
*
|
2024-10-02 08:44:41 -04:00
|
|
|
* Returns the count of newly acknowledged data, or a negative error code if
|
|
|
|
|
* the new range cannot be stored due to a fatal error.
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
*/
|
|
|
|
|
static int qc_stream_buf_store_ack(struct qc_stream_buf *buf,
|
2024-10-02 08:44:41 -04:00
|
|
|
struct qc_stream_desc *stream,
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
uint64_t offset, uint64_t len, int fin)
|
|
|
|
|
{
|
|
|
|
|
struct eb64_node *less, *more;
|
|
|
|
|
struct qc_stream_ack *ack, *ack_less = NULL, *ack_more = NULL;
|
2024-10-02 08:44:41 -04:00
|
|
|
int newly_acked = len;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
|
|
|
|
|
more = eb64_lookup_ge(&buf->ack_tree, offset);
|
|
|
|
|
if (more)
|
|
|
|
|
ack_more = eb64_entry(more, struct qc_stream_ack, offset_node);
|
|
|
|
|
|
|
|
|
|
/* Ranges are always merged before insertion so there could be no
|
|
|
|
|
* overlapping or just contiguous different ranges. No need to use
|
|
|
|
|
* <ack_less> if an existing range already starts at requested offset.
|
|
|
|
|
*/
|
|
|
|
|
less = eb64_lookup_le(&buf->ack_tree, offset);
|
|
|
|
|
if (less && more != less)
|
|
|
|
|
ack_less = eb64_entry(less, struct qc_stream_ack, offset_node);
|
|
|
|
|
|
|
|
|
|
/* Ensure that offset:len range has not been already acknowledged, at least partially. */
|
2024-10-09 06:03:36 -04:00
|
|
|
if ((ack_more && offset == ack_more->offset_node.key && offset + len <= ack_more->offset_node.key + ack_more->len) ||
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
(ack_less && ack_less->offset_node.key + ack_less->len >= offset + len)) {
|
2024-10-02 08:44:41 -04:00
|
|
|
newly_acked = 0;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
goto end;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If current range is contiguous or overlapping with one or several
|
|
|
|
|
* superior ranges, extend current range and delete superior ranges.
|
|
|
|
|
*/
|
|
|
|
|
while (ack_more && offset + len >= ack_more->offset_node.key) {
|
|
|
|
|
struct eb64_node *next;
|
|
|
|
|
|
|
|
|
|
if (offset + len < ack_more->offset_node.key + ack_more->len) {
|
2024-10-02 08:44:41 -04:00
|
|
|
newly_acked -= (offset + len) - ack_more->offset_node.key;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
/* Extend current range to cover the next entry. */
|
|
|
|
|
len += (ack_more->offset_node.key + ack_more->len) - (offset + len);
|
|
|
|
|
fin = ack_more->fin;
|
|
|
|
|
}
|
2024-10-02 08:44:41 -04:00
|
|
|
else {
|
|
|
|
|
newly_acked -= ack_more->len;
|
|
|
|
|
}
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
|
|
|
|
|
/* Remove the next range as it is covered by the current one. */
|
|
|
|
|
next = eb64_next(more);
|
|
|
|
|
eb64_delete(more);
|
|
|
|
|
pool_free(pool_head_quic_stream_ack, ack_more);
|
|
|
|
|
|
|
|
|
|
more = next;
|
|
|
|
|
ack_more = more ? eb64_entry(more, struct qc_stream_ack, offset_node) : NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If there is a contiguous or overlapping smaller range, extend it
|
|
|
|
|
* without adding a new entry.
|
|
|
|
|
*/
|
|
|
|
|
if (ack_less &&
|
|
|
|
|
ack_less->offset_node.key + ack_less->len >= offset) {
|
2024-10-02 08:44:41 -04:00
|
|
|
newly_acked -= (ack_less->offset_node.key + ack_less->len) - offset;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
/* Extend previous entry to fully cover the current range. */
|
|
|
|
|
ack_less->len += (offset + len) -
|
|
|
|
|
(ack_less->offset_node.key + ack_less->len);
|
|
|
|
|
ack_less->fin = fin;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* Store a new ACK stream range. */
|
|
|
|
|
ack = pool_alloc(pool_head_quic_stream_ack);
|
|
|
|
|
if (!ack) {
|
2024-10-02 08:44:41 -04:00
|
|
|
newly_acked = -1;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
goto end;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ack->offset_node.key = offset;
|
|
|
|
|
ack->len = len;
|
|
|
|
|
ack->fin = fin;
|
|
|
|
|
|
|
|
|
|
eb64_insert(&buf->ack_tree, &ack->offset_node);
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-02 08:44:41 -04:00
|
|
|
buf->room += newly_acked;
|
|
|
|
|
if (stream->notify_room && qc_stream_buf_is_released(buf, stream))
|
|
|
|
|
stream->notify_room(stream, newly_acked);
|
|
|
|
|
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
end:
|
2024-10-02 08:44:41 -04:00
|
|
|
return newly_acked;
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
}
|
|
|
|
|
|
2024-09-30 03:48:29 -04:00
|
|
|
/* Acknowledges data for buffer <buf> attached to <stream> instance. This covers
|
2025-04-01 16:44:54 -04:00
|
|
|
* the range starting at <offset> and of length <len>, with <fin> sets for the
|
2024-09-30 03:48:29 -04:00
|
|
|
* last stream frame.
|
|
|
|
|
*
|
|
|
|
|
* Returns <buf> if there is still data to acknowledge or buffered ACK to
|
|
|
|
|
* consume after completing the operation. Else, the next buffer instance of
|
|
|
|
|
* stream is returned if it exists or NULL in the contrary case.
|
|
|
|
|
*/
|
|
|
|
|
static struct qc_stream_buf *qc_stream_buf_ack(struct qc_stream_buf *buf,
|
|
|
|
|
struct qc_stream_desc *stream,
|
|
|
|
|
uint64_t offset, uint64_t len, int fin)
|
|
|
|
|
{
|
2024-10-02 08:44:41 -04:00
|
|
|
uint64_t diff;
|
|
|
|
|
|
2024-09-30 03:48:29 -04:00
|
|
|
/* This function does not deal with out-of-order ACK. */
|
|
|
|
|
BUG_ON(offset > stream->ack_offset);
|
|
|
|
|
|
|
|
|
|
if (offset + len > stream->ack_offset) {
|
2024-10-02 08:44:41 -04:00
|
|
|
diff = offset + len - stream->ack_offset;
|
2024-09-30 03:48:29 -04:00
|
|
|
b_del(&buf->buf, diff);
|
|
|
|
|
stream->ack_offset += diff;
|
2025-05-07 11:32:46 -04:00
|
|
|
bdata_ctr_del(&stream->data, diff);
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
|
|
|
|
|
/* notify room from acked data if buffer has been released. */
|
2024-10-02 08:44:41 -04:00
|
|
|
if (stream->notify_room && qc_stream_buf_is_released(buf, stream)) {
|
|
|
|
|
if (diff >= buf->room) {
|
|
|
|
|
diff -= buf->room;
|
|
|
|
|
buf->room = 0;
|
|
|
|
|
stream->notify_room(stream, diff);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
buf->room -= diff;
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-09-30 03:48:29 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (fin) {
|
|
|
|
|
/* Mark FIN as acknowledged. */
|
|
|
|
|
stream->flags &= ~QC_SD_FL_WAIT_FOR_FIN;
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-01 11:34:55 -04:00
|
|
|
if (!b_data(&buf->buf) && eb_is_empty(&buf->ack_tree)) {
|
2024-09-30 03:48:29 -04:00
|
|
|
qc_stream_buf_free(stream, &buf);
|
|
|
|
|
/* Retrieve next buffer instance. */
|
|
|
|
|
buf = !eb_is_empty(&stream->buf_tree) ?
|
|
|
|
|
eb64_entry(eb64_first(&stream->buf_tree), struct qc_stream_buf, offset_node) :
|
|
|
|
|
NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return buf;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Consume buffered ACK starting at <stream_buf>. If all buffer data is
|
|
|
|
|
* removed, <stream_buf> is freed and consume will be conducted for following
|
|
|
|
|
* streambufs from <stream> if present.
|
|
|
|
|
*/
|
|
|
|
|
static void qc_stream_buf_consume(struct qc_stream_buf *stream_buf,
|
|
|
|
|
struct qc_stream_desc *stream)
|
|
|
|
|
{
|
2024-10-01 11:34:55 -04:00
|
|
|
struct qc_stream_ack *ack;
|
|
|
|
|
struct eb64_node *ack_node;
|
|
|
|
|
|
|
|
|
|
ack_node = eb64_first(&stream_buf->ack_tree);
|
|
|
|
|
while (ack_node) {
|
|
|
|
|
ack = eb64_entry(ack_node, struct qc_stream_ack, offset_node);
|
|
|
|
|
if (ack->offset_node.key > stream->ack_offset)
|
2024-09-30 03:48:29 -04:00
|
|
|
break;
|
|
|
|
|
|
2026-04-27 21:43:22 -04:00
|
|
|
/* For the active buf, room count is decremented on buffered ACK
|
|
|
|
|
* consumption.
|
|
|
|
|
*/
|
2024-10-02 08:44:41 -04:00
|
|
|
if (stream_buf == stream->buf)
|
|
|
|
|
stream_buf->room = MAX((int64_t)(stream_buf->room - ack->len), 0);
|
|
|
|
|
|
2024-10-01 11:34:55 -04:00
|
|
|
/* Delete range before acknowledged it. This prevents BUG_ON()
|
|
|
|
|
* on non-empty ack_tree tree when stream_buf is empty and removed.
|
2024-09-30 03:48:29 -04:00
|
|
|
*/
|
2024-10-01 11:34:55 -04:00
|
|
|
eb64_delete(ack_node);
|
|
|
|
|
stream_buf = qc_stream_buf_ack(stream_buf, stream,
|
|
|
|
|
ack->offset_node.key, ack->len, ack->fin);
|
|
|
|
|
pool_free(pool_head_quic_stream_ack, ack);
|
2024-09-30 03:48:29 -04:00
|
|
|
|
2024-10-01 11:34:55 -04:00
|
|
|
ack_node = stream_buf ? eb64_first(&stream_buf->ack_tree) : NULL;
|
2024-09-30 03:48:29 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-30 03:21:10 -04:00
|
|
|
/* Acknowledge <frm> STREAM frame whose content is managed by <stream>
|
|
|
|
|
* descriptor.
|
2022-04-21 03:32:53 -04:00
|
|
|
*
|
2024-09-30 03:21:10 -04:00
|
|
|
* Returns 0 if the frame has been handled and can be removed.
|
|
|
|
|
* Returns a positive value if acknowledgement is out-of-order and
|
|
|
|
|
* corresponding STREAM frame has been buffered.
|
2024-10-01 11:34:55 -04:00
|
|
|
* Returns a negative value on fatal error.
|
2022-04-21 03:32:53 -04:00
|
|
|
*/
|
2024-10-01 11:34:55 -04:00
|
|
|
int qc_stream_desc_ack(struct qc_stream_desc *stream,
|
|
|
|
|
uint64_t offset, uint64_t len, int fin)
|
2022-04-21 03:32:53 -04:00
|
|
|
{
|
BUG/MEDIUM: quic: handle retransmit for standalone FIN STREAM
STREAM frames have dedicated handling on retransmission. A special check
is done to remove data already acked in case of duplicated frames, thus
only unacked data are retransmitted.
This handling is faulty in case of an empty STREAM frame with FIN set.
On retransmission, this frame does not cover any unacked range as it is
empty and is thus discarded. This may cause the transfer to freeze with
the client waiting indefinitely for the FIN notification.
To handle retransmission of empty FIN STREAM frame, qc_stream_desc layer
have been extended. A new flag QC_SD_FL_WAIT_FOR_FIN is set by MUX QUIC
when FIN has been transmitted. If set, it prevents qc_stream_desc to be
freed until FIN is acknowledged. On retransmission side,
qc_stream_frm_is_acked() has been updated. It now reports false if
FIN bit is set on the frame and qc_stream_desc has QC_SD_FL_WAIT_FOR_FIN
set.
This must be backported up to 2.6. However, this modifies heavily
critical section for ACK handling and retransmission. As such, it must
be backported only after a period of observation.
This issue can be reproduced by using the following socat command as
server to add delay between the response and connection closure :
$ socat TCP-LISTEN:<port>,fork,reuseaddr,crlf SYSTEM:'echo "HTTP/1.1 200 OK"; echo ""; sleep 1;'
On the client side, ngtcp2 can be used to simulate packet drop. Without
this patch, connection will be interrupted on QUIC idle timeout or
haproxy client timeout with ERR_DRAINING on ngtcp2 :
$ ngtcp2-client --exit-on-all-streams-close -r 0.3 <host> <port> "http://<host>:<port>/?s=32o"
Alternatively to ngtcp2 random loss, an extra haproxy patch can also be
used to force skipping the emission of the empty STREAM frame :
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
index efbdfe687..1ff899acd 100644
--- a/include/haproxy/quic_tx-t.h
+++ b/include/haproxy/quic_tx-t.h
@@ -26,6 +26,8 @@ extern struct pool_head *pool_head_quic_cc_buf;
/* Flag a sent packet as being probing with old data */
#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+#define QUIC_FL_TX_PACKET_SKIP_SENDTO (1UL << 6)
+
/* Structure to store enough information about TX QUIC packets. */
struct quic_tx_packet {
/* List entry point. */
diff --git a/src/quic_tx.c b/src/quic_tx.c
index 2f199ac3c..2702fc9b9 100644
--- a/src/quic_tx.c
+++ b/src/quic_tx.c
@@ -318,7 +318,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
tmpbuf.size = tmpbuf.data = dglen;
TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc);
- if (!skip_sendto) {
+ if (!skip_sendto && !(first_pkt->flags & QUIC_FL_TX_PACKET_SKIP_SENDTO)) {
int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0, gso);
if (ret < 0) {
if (gso && ret == -EIO) {
@@ -354,6 +354,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
qc->cntrs.sent_bytes_gso += ret;
}
}
+ first_pkt->flags &= ~QUIC_FL_TX_PACKET_SKIP_SENDTO;
b_del(buf, dglen + QUIC_DGRAM_HEADLEN);
qc->bytes.tx += tmpbuf.data;
@@ -2066,6 +2067,17 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
continue;
}
+ switch (cf->type) {
+ case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+ if (!cf->stream.len && (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT)) {
+ TRACE_USER("artificially drop packet with empty STREAM frame", QUIC_EV_CONN_TXPKT, qc);
+ pkt->flags |= QUIC_FL_TX_PACKET_SKIP_SENDTO;
+ }
+ break;
+ default:
+ break;
+ }
+
quic_tx_packet_refinc(pkt);
cf->pkt = pkt;
}
2024-08-05 12:58:49 -04:00
|
|
|
struct qc_stream_buf *stream_buf = NULL;
|
2024-09-30 03:21:10 -04:00
|
|
|
struct eb64_node *buf_node;
|
2024-09-30 03:48:29 -04:00
|
|
|
int ret = 0;
|
2022-04-21 03:32:53 -04:00
|
|
|
|
BUG/MEDIUM: quic: handle retransmit for standalone FIN STREAM
STREAM frames have dedicated handling on retransmission. A special check
is done to remove data already acked in case of duplicated frames, thus
only unacked data are retransmitted.
This handling is faulty in case of an empty STREAM frame with FIN set.
On retransmission, this frame does not cover any unacked range as it is
empty and is thus discarded. This may cause the transfer to freeze with
the client waiting indefinitely for the FIN notification.
To handle retransmission of empty FIN STREAM frame, qc_stream_desc layer
have been extended. A new flag QC_SD_FL_WAIT_FOR_FIN is set by MUX QUIC
when FIN has been transmitted. If set, it prevents qc_stream_desc to be
freed until FIN is acknowledged. On retransmission side,
qc_stream_frm_is_acked() has been updated. It now reports false if
FIN bit is set on the frame and qc_stream_desc has QC_SD_FL_WAIT_FOR_FIN
set.
This must be backported up to 2.6. However, this modifies heavily
critical section for ACK handling and retransmission. As such, it must
be backported only after a period of observation.
This issue can be reproduced by using the following socat command as
server to add delay between the response and connection closure :
$ socat TCP-LISTEN:<port>,fork,reuseaddr,crlf SYSTEM:'echo "HTTP/1.1 200 OK"; echo ""; sleep 1;'
On the client side, ngtcp2 can be used to simulate packet drop. Without
this patch, connection will be interrupted on QUIC idle timeout or
haproxy client timeout with ERR_DRAINING on ngtcp2 :
$ ngtcp2-client --exit-on-all-streams-close -r 0.3 <host> <port> "http://<host>:<port>/?s=32o"
Alternatively to ngtcp2 random loss, an extra haproxy patch can also be
used to force skipping the emission of the empty STREAM frame :
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
index efbdfe687..1ff899acd 100644
--- a/include/haproxy/quic_tx-t.h
+++ b/include/haproxy/quic_tx-t.h
@@ -26,6 +26,8 @@ extern struct pool_head *pool_head_quic_cc_buf;
/* Flag a sent packet as being probing with old data */
#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+#define QUIC_FL_TX_PACKET_SKIP_SENDTO (1UL << 6)
+
/* Structure to store enough information about TX QUIC packets. */
struct quic_tx_packet {
/* List entry point. */
diff --git a/src/quic_tx.c b/src/quic_tx.c
index 2f199ac3c..2702fc9b9 100644
--- a/src/quic_tx.c
+++ b/src/quic_tx.c
@@ -318,7 +318,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
tmpbuf.size = tmpbuf.data = dglen;
TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc);
- if (!skip_sendto) {
+ if (!skip_sendto && !(first_pkt->flags & QUIC_FL_TX_PACKET_SKIP_SENDTO)) {
int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0, gso);
if (ret < 0) {
if (gso && ret == -EIO) {
@@ -354,6 +354,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
qc->cntrs.sent_bytes_gso += ret;
}
}
+ first_pkt->flags &= ~QUIC_FL_TX_PACKET_SKIP_SENDTO;
b_del(buf, dglen + QUIC_DGRAM_HEADLEN);
qc->bytes.tx += tmpbuf.data;
@@ -2066,6 +2067,17 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
continue;
}
+ switch (cf->type) {
+ case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+ if (!cf->stream.len && (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT)) {
+ TRACE_USER("artificially drop packet with empty STREAM frame", QUIC_EV_CONN_TXPKT, qc);
+ pkt->flags |= QUIC_FL_TX_PACKET_SKIP_SENDTO;
+ }
+ break;
+ default:
+ break;
+ }
+
quic_tx_packet_refinc(pkt);
cf->pkt = pkt;
}
2024-08-05 12:58:49 -04:00
|
|
|
/* Cannot advertise FIN for an inferior data range. */
|
2024-09-26 10:14:40 -04:00
|
|
|
BUG_ON(fin && offset + len < stream->ack_offset);
|
2022-04-21 03:32:53 -04:00
|
|
|
|
2024-09-30 03:48:29 -04:00
|
|
|
/* Do nothing for offset + len < stream->ack_offset as data were
|
|
|
|
|
* already acknowledged and removed.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
if (!len) {
|
|
|
|
|
BUG_ON(!fin); /* An empty STREAM frame is only needed for a late FIN reporting. */
|
|
|
|
|
|
|
|
|
|
/* Empty STREAM frame with FIN can be acknowledged out-of-order. */
|
|
|
|
|
stream->flags &= ~QC_SD_FL_WAIT_FOR_FIN;
|
2024-09-30 03:21:10 -04:00
|
|
|
}
|
|
|
|
|
else if (offset > stream->ack_offset) {
|
|
|
|
|
buf_node = eb64_lookup_le(&stream->buf_tree, offset);
|
|
|
|
|
BUG_ON(!buf_node); /* Cannot acknowledged a STREAM frame for a non existing buffer. */
|
|
|
|
|
stream_buf = eb64_entry(buf_node, struct qc_stream_buf, offset_node);
|
2024-10-02 08:44:41 -04:00
|
|
|
ret = qc_stream_buf_store_ack(stream_buf, stream, offset, len, fin);
|
2024-09-30 03:21:10 -04:00
|
|
|
}
|
2024-09-30 03:48:29 -04:00
|
|
|
else if (offset + len > stream->ack_offset) {
|
BUG/MEDIUM: quic: handle retransmit for standalone FIN STREAM
STREAM frames have dedicated handling on retransmission. A special check
is done to remove data already acked in case of duplicated frames, thus
only unacked data are retransmitted.
This handling is faulty in case of an empty STREAM frame with FIN set.
On retransmission, this frame does not cover any unacked range as it is
empty and is thus discarded. This may cause the transfer to freeze with
the client waiting indefinitely for the FIN notification.
To handle retransmission of empty FIN STREAM frame, qc_stream_desc layer
have been extended. A new flag QC_SD_FL_WAIT_FOR_FIN is set by MUX QUIC
when FIN has been transmitted. If set, it prevents qc_stream_desc to be
freed until FIN is acknowledged. On retransmission side,
qc_stream_frm_is_acked() has been updated. It now reports false if
FIN bit is set on the frame and qc_stream_desc has QC_SD_FL_WAIT_FOR_FIN
set.
This must be backported up to 2.6. However, this modifies heavily
critical section for ACK handling and retransmission. As such, it must
be backported only after a period of observation.
This issue can be reproduced by using the following socat command as
server to add delay between the response and connection closure :
$ socat TCP-LISTEN:<port>,fork,reuseaddr,crlf SYSTEM:'echo "HTTP/1.1 200 OK"; echo ""; sleep 1;'
On the client side, ngtcp2 can be used to simulate packet drop. Without
this patch, connection will be interrupted on QUIC idle timeout or
haproxy client timeout with ERR_DRAINING on ngtcp2 :
$ ngtcp2-client --exit-on-all-streams-close -r 0.3 <host> <port> "http://<host>:<port>/?s=32o"
Alternatively to ngtcp2 random loss, an extra haproxy patch can also be
used to force skipping the emission of the empty STREAM frame :
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
index efbdfe687..1ff899acd 100644
--- a/include/haproxy/quic_tx-t.h
+++ b/include/haproxy/quic_tx-t.h
@@ -26,6 +26,8 @@ extern struct pool_head *pool_head_quic_cc_buf;
/* Flag a sent packet as being probing with old data */
#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+#define QUIC_FL_TX_PACKET_SKIP_SENDTO (1UL << 6)
+
/* Structure to store enough information about TX QUIC packets. */
struct quic_tx_packet {
/* List entry point. */
diff --git a/src/quic_tx.c b/src/quic_tx.c
index 2f199ac3c..2702fc9b9 100644
--- a/src/quic_tx.c
+++ b/src/quic_tx.c
@@ -318,7 +318,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
tmpbuf.size = tmpbuf.data = dglen;
TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc);
- if (!skip_sendto) {
+ if (!skip_sendto && !(first_pkt->flags & QUIC_FL_TX_PACKET_SKIP_SENDTO)) {
int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0, gso);
if (ret < 0) {
if (gso && ret == -EIO) {
@@ -354,6 +354,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
qc->cntrs.sent_bytes_gso += ret;
}
}
+ first_pkt->flags &= ~QUIC_FL_TX_PACKET_SKIP_SENDTO;
b_del(buf, dglen + QUIC_DGRAM_HEADLEN);
qc->bytes.tx += tmpbuf.data;
@@ -2066,6 +2067,17 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
continue;
}
+ switch (cf->type) {
+ case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+ if (!cf->stream.len && (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT)) {
+ TRACE_USER("artificially drop packet with empty STREAM frame", QUIC_EV_CONN_TXPKT, qc);
+ pkt->flags |= QUIC_FL_TX_PACKET_SKIP_SENDTO;
+ }
+ break;
+ default:
+ break;
+ }
+
quic_tx_packet_refinc(pkt);
cf->pkt = pkt;
}
2024-08-05 12:58:49 -04:00
|
|
|
/* Buf list cannot be empty if there is still unacked data. */
|
2024-10-01 05:27:37 -04:00
|
|
|
BUG_ON(eb_is_empty(&stream->buf_tree));
|
2022-04-21 03:32:53 -04:00
|
|
|
|
2024-10-01 05:27:37 -04:00
|
|
|
/* get oldest buffer from buf tree */
|
|
|
|
|
stream_buf = eb64_entry(eb64_first(&stream->buf_tree), struct qc_stream_buf, offset_node);
|
2024-09-30 03:48:29 -04:00
|
|
|
stream_buf = qc_stream_buf_ack(stream_buf, stream, offset, len, fin);
|
2022-04-15 11:30:49 -04:00
|
|
|
|
2024-09-30 03:48:29 -04:00
|
|
|
/* some data were acknowledged, try to consume buffered ACKs */
|
|
|
|
|
if (stream_buf)
|
|
|
|
|
qc_stream_buf_consume(stream_buf, stream);
|
BUG/MEDIUM: quic: handle retransmit for standalone FIN STREAM
STREAM frames have dedicated handling on retransmission. A special check
is done to remove data already acked in case of duplicated frames, thus
only unacked data are retransmitted.
This handling is faulty in case of an empty STREAM frame with FIN set.
On retransmission, this frame does not cover any unacked range as it is
empty and is thus discarded. This may cause the transfer to freeze with
the client waiting indefinitely for the FIN notification.
To handle retransmission of empty FIN STREAM frame, qc_stream_desc layer
have been extended. A new flag QC_SD_FL_WAIT_FOR_FIN is set by MUX QUIC
when FIN has been transmitted. If set, it prevents qc_stream_desc to be
freed until FIN is acknowledged. On retransmission side,
qc_stream_frm_is_acked() has been updated. It now reports false if
FIN bit is set on the frame and qc_stream_desc has QC_SD_FL_WAIT_FOR_FIN
set.
This must be backported up to 2.6. However, this modifies heavily
critical section for ACK handling and retransmission. As such, it must
be backported only after a period of observation.
This issue can be reproduced by using the following socat command as
server to add delay between the response and connection closure :
$ socat TCP-LISTEN:<port>,fork,reuseaddr,crlf SYSTEM:'echo "HTTP/1.1 200 OK"; echo ""; sleep 1;'
On the client side, ngtcp2 can be used to simulate packet drop. Without
this patch, connection will be interrupted on QUIC idle timeout or
haproxy client timeout with ERR_DRAINING on ngtcp2 :
$ ngtcp2-client --exit-on-all-streams-close -r 0.3 <host> <port> "http://<host>:<port>/?s=32o"
Alternatively to ngtcp2 random loss, an extra haproxy patch can also be
used to force skipping the emission of the empty STREAM frame :
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
index efbdfe687..1ff899acd 100644
--- a/include/haproxy/quic_tx-t.h
+++ b/include/haproxy/quic_tx-t.h
@@ -26,6 +26,8 @@ extern struct pool_head *pool_head_quic_cc_buf;
/* Flag a sent packet as being probing with old data */
#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+#define QUIC_FL_TX_PACKET_SKIP_SENDTO (1UL << 6)
+
/* Structure to store enough information about TX QUIC packets. */
struct quic_tx_packet {
/* List entry point. */
diff --git a/src/quic_tx.c b/src/quic_tx.c
index 2f199ac3c..2702fc9b9 100644
--- a/src/quic_tx.c
+++ b/src/quic_tx.c
@@ -318,7 +318,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
tmpbuf.size = tmpbuf.data = dglen;
TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc);
- if (!skip_sendto) {
+ if (!skip_sendto && !(first_pkt->flags & QUIC_FL_TX_PACKET_SKIP_SENDTO)) {
int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0, gso);
if (ret < 0) {
if (gso && ret == -EIO) {
@@ -354,6 +354,7 @@ static int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx)
qc->cntrs.sent_bytes_gso += ret;
}
}
+ first_pkt->flags &= ~QUIC_FL_TX_PACKET_SKIP_SENDTO;
b_del(buf, dglen + QUIC_DGRAM_HEADLEN);
qc->bytes.tx += tmpbuf.data;
@@ -2066,6 +2067,17 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
continue;
}
+ switch (cf->type) {
+ case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+ if (!cf->stream.len && (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT)) {
+ TRACE_USER("artificially drop packet with empty STREAM frame", QUIC_EV_CONN_TXPKT, qc);
+ pkt->flags |= QUIC_FL_TX_PACKET_SKIP_SENDTO;
+ }
+ break;
+ default:
+ break;
+ }
+
quic_tx_packet_refinc(pkt);
cf->pkt = pkt;
}
2024-08-05 12:58:49 -04:00
|
|
|
}
|
|
|
|
|
|
2024-09-30 03:48:29 -04:00
|
|
|
return ret;
|
2022-04-21 03:32:53 -04:00
|
|
|
}
|
|
|
|
|
|
2022-04-15 11:29:25 -04:00
|
|
|
/* Free the stream descriptor <stream> content. This function should be used
|
2022-08-20 12:59:36 -04:00
|
|
|
* when all its data have been acknowledged or on full connection closing if <closing>
|
|
|
|
|
* boolean is set to 1. It must only be called after the stream is released.
|
2022-04-19 11:21:11 -04:00
|
|
|
*/
|
2022-08-20 12:59:36 -04:00
|
|
|
void qc_stream_desc_free(struct qc_stream_desc *stream, int closing)
|
2022-04-19 11:21:11 -04:00
|
|
|
{
|
2024-10-01 05:27:37 -04:00
|
|
|
struct qc_stream_buf *buf;
|
2024-10-01 11:34:55 -04:00
|
|
|
struct eb64_node *ack_node, *buf_node;
|
2022-04-15 11:29:25 -04:00
|
|
|
unsigned int free_count = 0;
|
2022-04-19 11:21:11 -04:00
|
|
|
|
2022-04-15 11:29:25 -04:00
|
|
|
/* This function only deals with released streams. */
|
2024-08-05 12:52:27 -04:00
|
|
|
BUG_ON(!(stream->flags & QC_SD_FL_RELEASE));
|
2022-04-15 11:29:25 -04:00
|
|
|
|
|
|
|
|
/* free remaining stream buffers */
|
2024-10-01 05:27:37 -04:00
|
|
|
while (!eb_is_empty(&stream->buf_tree)) {
|
|
|
|
|
buf_node = eb64_first(&stream->buf_tree);
|
|
|
|
|
buf = eb64_entry(buf_node, struct qc_stream_buf, offset_node);
|
2022-04-15 11:29:25 -04:00
|
|
|
|
2024-09-27 09:31:21 -04:00
|
|
|
/* qc_stream_desc_free() can only be used after all data is
|
|
|
|
|
* acknowledged or on connection shutdown. In the contrary
|
|
|
|
|
* case, MUX must be notified about room available.
|
|
|
|
|
*/
|
|
|
|
|
BUG_ON(b_data(&buf->buf) && !closing);
|
2022-04-15 11:29:25 -04:00
|
|
|
|
2024-10-01 05:13:41 -04:00
|
|
|
/* qc_stream_desc might be freed before having received all its ACKs. */
|
2024-10-01 11:34:55 -04:00
|
|
|
while (!eb_is_empty(&buf->ack_tree)) {
|
|
|
|
|
struct qc_stream_ack *ack;
|
2024-10-01 05:13:41 -04:00
|
|
|
|
2024-10-01 11:34:55 -04:00
|
|
|
ack_node = eb64_first(&buf->ack_tree);
|
|
|
|
|
eb64_delete(ack_node);
|
2024-10-01 05:13:41 -04:00
|
|
|
|
2024-10-01 11:34:55 -04:00
|
|
|
ack = eb64_entry(ack_node, struct qc_stream_ack, offset_node);
|
|
|
|
|
pool_free(pool_head_quic_stream_ack, ack);
|
2024-10-01 05:13:41 -04:00
|
|
|
}
|
|
|
|
|
|
2026-03-09 02:38:22 -04:00
|
|
|
b_free(&buf->buf);
|
2024-09-27 09:31:21 -04:00
|
|
|
|
2024-10-01 05:27:37 -04:00
|
|
|
eb64_delete(&buf->offset_node);
|
2024-09-27 09:31:21 -04:00
|
|
|
pool_free(pool_head_quic_stream_buf, buf);
|
|
|
|
|
++free_count;
|
2022-04-15 11:30:49 -04:00
|
|
|
}
|
|
|
|
|
|
2024-09-27 09:31:21 -04:00
|
|
|
if (free_count)
|
|
|
|
|
offer_buffers(NULL, free_count);
|
|
|
|
|
|
2022-09-09 12:05:45 -04:00
|
|
|
if (stream->by_id.key != (uint64_t)-1)
|
|
|
|
|
eb64_delete(&stream->by_id);
|
2022-05-27 03:11:02 -04:00
|
|
|
pool_free(pool_head_quic_stream_desc, stream);
|
2022-04-15 11:29:25 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return the current buffer of <stream>. May be NULL if not allocated. */
|
|
|
|
|
struct buffer *qc_stream_buf_get(struct qc_stream_desc *stream)
|
|
|
|
|
{
|
|
|
|
|
if (!stream->buf)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
return &stream->buf->buf;
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-13 05:57:50 -04:00
|
|
|
/* Allocate a new current buffer for <stream>. This function is not allowed if
|
|
|
|
|
* current buffer is not NULL prior to this call. The new buffer represents
|
|
|
|
|
* stream payload at offset <offset>.
|
2022-04-15 11:29:25 -04:00
|
|
|
*
|
2024-08-13 05:57:50 -04:00
|
|
|
* Returns the buffer or NULL on error.
|
2022-04-15 11:29:25 -04:00
|
|
|
*/
|
|
|
|
|
struct buffer *qc_stream_buf_alloc(struct qc_stream_desc *stream,
|
2024-06-13 09:26:51 -04:00
|
|
|
uint64_t offset, int small)
|
2022-04-15 11:29:25 -04:00
|
|
|
{
|
|
|
|
|
/* current buffer must be released first before allocate a new one. */
|
|
|
|
|
BUG_ON(stream->buf);
|
|
|
|
|
|
|
|
|
|
stream->buf_offset = offset;
|
2022-05-27 03:11:02 -04:00
|
|
|
stream->buf = pool_alloc(pool_head_quic_stream_buf);
|
2022-04-15 11:29:25 -04:00
|
|
|
if (!stream->buf)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
MEDIUM: quic: merge contiguous/overlapping buffered ack stream range
Transfer throughput was deteriorated since recent rework of QUIC MUX
txbuf allocator. This was partially restorated with the commit to
decount individual in-order ACK from the MUX buffer window.
To fully retrieve the old performance level, all ACKs must be decounted
when handled by QUIC streamdesc layer, event out-of-order ranges.
However, this is not easily implemented as several ranges may exist in
parallel with overlap on the underlying data. It would cause
miscalculation for QUIC MUX buffer window if such ranges were blindly
reported.
The proper solution is to first implement merge of contiguous or
overlapping ACK data ranges to reduce the number of stored ranges to the
minimal. This is the purpose of this patch. This is implemented in a new
static function named qc_stream_buf_store_ack() into streamdesc layer.
The merge algorithm is simple enough. First, it ensures the newly added
range is not already fully covered by a preexisting entry. Then, it
checks if there is contiguity/overlap with one or several ranges
starting at the same of a greater offset. If true, the newly added entry
is extended to cover them all, and all contiguous/overlapped ranges are
removed. Finally, if there is contiguity or overlap with an entry
starting at a smaller offset, no new range is instantiated and instead
the smaller offset is extended.
Now that contiguous or overlapped ranges cannot exits anymore, ACK data
ranges tree instiatiation can used EB_ROOT_UNIQUE.
Outside of the longer term objective which is to decount out-of-order
ACKs from MUX txbuf window, this commit could also improve some
performance and/or memory usage for connections where stream data
fragmentation and packet reording is high.
2024-10-02 04:23:21 -04:00
|
|
|
stream->buf->ack_tree = EB_ROOT_UNIQUE;
|
2024-10-02 08:44:41 -04:00
|
|
|
stream->buf->room = 0;
|
2024-08-13 05:57:50 -04:00
|
|
|
stream->buf->buf = BUF_NULL;
|
2024-10-01 05:27:37 -04:00
|
|
|
stream->buf->offset_node.key = offset;
|
2024-06-13 09:26:51 -04:00
|
|
|
|
2026-03-09 02:38:22 -04:00
|
|
|
if (!small || !global.tune.bufsize_small) {
|
2024-06-13 09:26:51 -04:00
|
|
|
stream->buf->sbuf = 0;
|
|
|
|
|
if (!b_alloc(&stream->buf->buf, DB_MUX_TX)) {
|
|
|
|
|
pool_free(pool_head_quic_stream_buf, stream->buf);
|
|
|
|
|
stream->buf = NULL;
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
2026-03-10 03:12:15 -04:00
|
|
|
if (!b_alloc_small(&stream->buf->buf)) {
|
2024-06-13 09:26:51 -04:00
|
|
|
pool_free(pool_head_quic_stream_buf, stream->buf);
|
|
|
|
|
stream->buf = NULL;
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
stream->buf->sbuf = 1;
|
2024-08-13 05:08:08 -04:00
|
|
|
}
|
2023-05-11 10:52:48 -04:00
|
|
|
|
2024-10-01 05:27:37 -04:00
|
|
|
eb64_insert(&stream->buf_tree, &stream->buf->offset_node);
|
2025-05-07 11:32:46 -04:00
|
|
|
bdata_ctr_binc(&stream->data);
|
2022-04-15 11:29:25 -04:00
|
|
|
|
|
|
|
|
return &stream->buf->buf;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-29 11:01:38 -04:00
|
|
|
/* Free current <stream> buffer and allocate a new one. This function is reserved
|
|
|
|
|
* to convert a small buffer to a standard one.
|
|
|
|
|
*
|
|
|
|
|
* Returns the buffer or NULL on error.
|
|
|
|
|
*/
|
|
|
|
|
struct buffer *qc_stream_buf_realloc(struct qc_stream_desc *stream)
|
|
|
|
|
{
|
2026-04-27 21:43:22 -04:00
|
|
|
/* This function is reserved to convert a small buffer to a standard one. */
|
2024-07-29 11:01:38 -04:00
|
|
|
BUG_ON(!stream->buf || !stream->buf->sbuf);
|
|
|
|
|
|
2025-04-01 16:44:54 -04:00
|
|
|
/* This function can only be used if targeted buffer is empty. */
|
2024-10-01 04:55:40 -04:00
|
|
|
BUG_ON(b_data(&stream->buf->buf));
|
|
|
|
|
|
2024-07-29 11:01:38 -04:00
|
|
|
/* Release buffer */
|
2026-03-09 02:38:22 -04:00
|
|
|
b_free(&stream->buf->buf);
|
2024-07-29 11:01:38 -04:00
|
|
|
stream->buf->buf = BUF_NULL;
|
|
|
|
|
stream->buf->sbuf = 0;
|
|
|
|
|
|
|
|
|
|
if (!b_alloc(&stream->buf->buf, DB_MUX_TX)) {
|
2024-10-01 05:27:37 -04:00
|
|
|
eb64_delete(&stream->buf->offset_node);
|
2024-07-29 11:01:38 -04:00
|
|
|
pool_free(pool_head_quic_stream_buf, stream->buf);
|
|
|
|
|
stream->buf = NULL;
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return &stream->buf->buf;
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-15 11:29:25 -04:00
|
|
|
/* Release the current buffer of <stream>. It will be kept internally by
|
|
|
|
|
* the <stream>. The current buffer cannot be NULL.
|
|
|
|
|
*/
|
|
|
|
|
void qc_stream_buf_release(struct qc_stream_desc *stream)
|
|
|
|
|
{
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
uint64_t room;
|
|
|
|
|
|
2022-04-15 11:29:25 -04:00
|
|
|
/* current buffer already released */
|
|
|
|
|
BUG_ON(!stream->buf);
|
|
|
|
|
|
2024-10-02 08:44:41 -04:00
|
|
|
room = b_room(&stream->buf->buf) + stream->buf->room;
|
2022-04-15 11:29:25 -04:00
|
|
|
stream->buf = NULL;
|
|
|
|
|
stream->buf_offset = 0;
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
|
2024-10-02 08:44:41 -04:00
|
|
|
/* Released buffer won't receive any new data. Reports non consumed
|
|
|
|
|
* space plus already stored out-of-order data range as available.
|
|
|
|
|
*/
|
MEDIUM: quic: decount acknowledged data for MUX txbuf window
Recently, a new allocation mechanism was implemented for Tx buffers used
by QUIC MUX. Now, underlying congestion window size is used to determine
if it is still possible or not to allocate a new buffer when necessary.
This mechanism has render the QUIC stack more flexible. However, it also
has brought some performance degradation, with transfer time longer in
certain environment. It was first discovered on the measurement results
of the interop. It can also easily be reproduced using the following
ngtcp2-client example which forces a very small congestion window due to
frequent loss :
$ ngtcp2-client -q --no-quic-dump --no-http-dump --exit-on-all-streams-close -r 0.1 127.0.0.1 20443 "https://[::]:20443/?s=10m"
This performance decrease is caused by the allocator which is now too
strict. It may cause buffer underrun frequently at the MUX layer when
the congestion window is too small, as new buffers cannot be allocated
until the current one is fully acknowledged. This resuls in transfers
with very bad throughput utilisation. The objective of this new serie of
patches is to relax some restrictions to permit QUIC MUX to allocate new
buffers more quickly, while preserving the initial limitation based on
congestion window size.
An interesting method for this is to notify QUIC MUX about newly
available room on individual ACK reception, without waiting for the full
bffer acknowledgement. This is easily implemented by adding a new
notify_room invokation in QUIC streamdesc layer on ACK reception.
However, ACK reception are handled in-order at the stream level. Out of
order ACKs are buffered and are not decounted for now. This will be
implemented in a future commit.
Note that for a single buffer instance, data can in parallel be written
by QUIC MUX and removed on ACK reception. This could cause room
notification to QUIC MUX layer to report invalid values. As such, ACK
reception are only accounted for released buffers. This ensures that
such buffers won't received any new data. In the same time, buffer room
is notified on release operation as it does not need acknowledgement.
This commit has permit to improve performance for the ngtcp2-client
scenario above. However, it is not yet sufficient enough for interop
goodput test.
2024-09-18 04:32:39 -04:00
|
|
|
if (stream->notify_room && room)
|
|
|
|
|
stream->notify_room(stream, room);
|
2022-04-15 11:29:25 -04:00
|
|
|
}
|