Skip to content

vllm.sequence

Sequence and its related classes.

LoRARequest module-attribute

LoRARequest = Any

VLLM_INVALID_TOKEN_ID module-attribute

VLLM_INVALID_TOKEN_ID = -1

VLLM_TOKEN_ID_ARRAY_TYPE module-attribute

VLLM_TOKEN_ID_ARRAY_TYPE = 'l'

ExecuteModelRequest

Bases: Struct

Source code in vllm/sequence.py
class ExecuteModelRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True):  # type: ignore[call-arg]
    """Field-less msgspec struct kept only as a placeholder."""
    # Placeholder. Remove.

IntermediateTensors dataclass

For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request.

Each stage also needs to handle its own kv_connector_output.

Source code in vllm/sequence.py
@dataclass
class IntermediateTensors:
    """For all pipeline stages except the last, we need to return the hidden
    states and residuals to be sent to the next stage. This data structure
    contains the hidden states and residuals for a request.

    Each stage also needs to handle its own kv_connector_output.

    The object behaves like a thin mapping over ``tensors``: string keys
    read/write individual tensors, and a slice key produces a new instance
    in which every tensor has been indexed with that slice.
    """

    tensors: dict[str, torch.Tensor]
    kv_connector_output: Optional[KVConnectorOutput]

    def __init__(self, tensors, kv_connector_output=None):
        """Store the tensor mapping and the optional connector output.

        Args:
            tensors: Mapping from tensor name to tensor.
            kv_connector_output: Value for the ``kv_connector_output``
                field; defaults to ``None`` so existing single-argument
                callers keep working.
        """
        # manually define this function, so that
        # Dynamo knows `IntermediateTensors()` comes from this file.
        # Otherwise, dataclass will generate this function by evaluating
        # a string, and we will lose the information about the source file.
        self.tensors = tensors
        # Always bind the declared field; leaving it unset made any read of
        # `kv_connector_output` raise AttributeError.
        self.kv_connector_output = kv_connector_output

    def __getitem__(self, key: Union[str, slice]):
        """Return one tensor (str key) or a sliced copy (slice key).

        The sliced copy is built from the tensors only, so its
        ``kv_connector_output`` is ``None``. Keys of any other type fall
        through and the method returns ``None``.
        """
        if isinstance(key, str):
            return self.tensors[key]
        elif isinstance(key, slice):
            return self.__class__({k: v[key] for k, v in self.tensors.items()})

    def __setitem__(self, key: str, value: torch.Tensor):
        """Store ``value`` under ``key`` in the tensor mapping."""
        self.tensors[key] = value

    def items(self):
        """Return the ``items()`` view of the tensor mapping."""
        return self.tensors.items()

    def __len__(self):
        """Return the number of entries in the tensor mapping."""
        return len(self.tensors)

    def __eq__(self, other: object):
        """Compare by tensor names and contents (via ``torch.equal``).

        ``kv_connector_output`` does not take part in the comparison.
        """
        if not isinstance(other, self.__class__):
            return False
        if self.tensors.keys() != other.tensors.keys():
            return False
        return all(
            torch.equal(self.tensors[k], other.tensors[k])
            for k in self.tensors)

    def __repr__(self) -> str:
        return f"IntermediateTensors(tensors={self.tensors})"

kv_connector_output instance-attribute

kv_connector_output: Optional[KVConnectorOutput]

tensors instance-attribute

tensors: dict[str, Tensor] = tensors

__eq__

__eq__(other: object)
Source code in vllm/sequence.py
def __eq__(self, other: object):
    """Return True when ``other`` has the same class, the same tensor
    names, and every pair of same-named tensors satisfies ``torch.equal``.
    """
    same_kind = isinstance(other, self.__class__)
    if same_kind and self.tensors.keys() == other.tensors.keys():
        for name, tensor in self.tensors.items():
            if not torch.equal(tensor, other.tensors[name]):
                return False
        return True
    return False

__getitem__

__getitem__(key: Union[str, slice])
Source code in vllm/sequence.py
def __getitem__(self, key: Union[str, slice]):
    """Look up one tensor by name, or slice every tensor at once.

    A string key returns ``self.tensors[key]``. A slice key returns a new
    instance of the same class whose tensors are each indexed with that
    slice. Any other key type falls through and yields ``None``.
    """
    if isinstance(key, slice):
        sliced = {}
        for name, tensor in self.tensors.items():
            sliced[name] = tensor[key]
        return self.__class__(sliced)
    if isinstance(key, str):
        return self.tensors[key]
    return None

__init__

__init__(tensors)
Source code in vllm/sequence.py
def __init__(self, tensors, kv_connector_output=None):
    """Store the tensor mapping and the optional connector output.

    Args:
        tensors: Mapping from tensor name to tensor.
        kv_connector_output: Value for the ``kv_connector_output``
            attribute; defaults to ``None`` so single-argument callers
            keep working.
    """
    # manually define this function, so that
    # Dynamo knows `IntermediateTensors()` comes from this file.
    # Otherwise, dataclass will generate this function by evaluating
    # a string, and we will lose the information about the source file.
    self.tensors = tensors
    # Always bind the attribute; leaving it unset made any read of
    # `kv_connector_output` raise AttributeError.
    self.kv_connector_output = kv_connector_output

__len__

__len__()
Source code in vllm/sequence.py
def __len__(self):
    """Return how many entries the ``tensors`` mapping contains."""
    entry_count = len(self.tensors)
    return entry_count

__repr__

__repr__() -> str
Source code in vllm/sequence.py
def __repr__(self) -> str:
    """Render the instance as ``IntermediateTensors(tensors=...)``."""
    mapping = self.tensors
    return f"IntermediateTensors(tensors={mapping})"

__setitem__

__setitem__(key: str, value: Tensor)
Source code in vllm/sequence.py
def __setitem__(self, key: str, value: torch.Tensor):
    """Bind ``value`` to ``key`` in the ``tensors`` mapping."""
    store = self.tensors
    store[key] = value

items

items()
Source code in vllm/sequence.py
def items(self):
    """Return the ``items()`` view of the ``tensors`` mapping."""
    pairs = self.tensors.items()
    return pairs

PoolerOutput

Bases: Struct

The output from a pooling operation in the pooling model.

Source code in vllm/sequence.py
class PoolerOutput(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """The output from a pooling operation in the pooling model.

    Wraps a list of per-sequence-group outputs and forwards indexing and
    length queries to that list.
    """
    outputs: list[PoolingSequenceGroupOutput]

    def get_data_nbytes(self) -> int:
        """Add up ``get_data_nbytes()`` over every entry in ``outputs``."""
        total = 0
        for entry in self.outputs:
            total += entry.get_data_nbytes()
        return total

    def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput:
        """Return the entry of ``outputs`` at position ``idx``."""
        entry = self.outputs[idx]
        return entry

    def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput):
        """Replace the entry of ``outputs`` at position ``idx``."""
        entries = self.outputs
        entries[idx] = value

    def __len__(self):
        """Return the number of entries in ``outputs``."""
        entries = self.outputs
        return len(entries)

    def __eq__(self, other: object):
        """Equal when ``other`` is of this class and its ``outputs`` match."""
        if not isinstance(other, self.__class__):
            return False
        return self.outputs == other.outputs

outputs instance-attribute

__eq__

__eq__(other: object)
Source code in vllm/sequence.py
def __eq__(self, other: object):
    """Equal when ``other`` is of this class and its ``outputs`` match."""
    if not isinstance(other, self.__class__):
        return False
    return self.outputs == other.outputs

__getitem__

__getitem__(idx: int) -> PoolingSequenceGroupOutput
Source code in vllm/sequence.py
def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput:
    """Return the entry of ``outputs`` at position ``idx``."""
    entry = self.outputs[idx]
    return entry

__len__

__len__()
Source code in vllm/sequence.py
def __len__(self):
    """Return the number of entries in ``outputs``."""
    entries = self.outputs
    return len(entries)

__setitem__

__setitem__(idx: int, value: PoolingSequenceGroupOutput)
Source code in vllm/sequence.py
def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput):
    """Replace the entry of ``outputs`` at position ``idx`` with ``value``."""
    entries = self.outputs
    entries[idx] = value

get_data_nbytes

get_data_nbytes() -> int
Source code in vllm/sequence.py
def get_data_nbytes(self) -> int:
    """Add up ``get_data_nbytes()`` over every entry in ``outputs``."""
    total = 0
    for entry in self.outputs:
        total += entry.get_data_nbytes()
    return total

PoolingSequenceGroupOutput

Bases: Struct

The model output associated with a pooling sequence group.

Source code in vllm/sequence.py
class PoolingSequenceGroupOutput(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True,  # type: ignore[call-arg]
):
    """The model output associated with a pooling sequence group."""
    # Annotated as Any to be compatible with msgspec
    # The actual type is in SequenceGroup.pooled_data
    data: Any

    def get_data_nbytes(self) -> int:
        """Return the ``nbytes`` attribute of ``data``.

        ``data`` is treated as a ``torch.Tensor`` here even though the
        field itself is annotated as ``Any``.
        """
        data: torch.Tensor = self.data
        return data.nbytes

    def __repr__(self) -> str:
        # The closing parenthesis was previously missing from this string.
        return f"PoolingSequenceGroupOutput(data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Compare ``data`` with another ``PoolingSequenceGroupOutput``.

        Returns ``NotImplemented`` for any other type so that Python can
        fall back to its default handling (e.g. ``obj == None`` is False)
        instead of raising from inside ``==``.
        """
        if not isinstance(other, PoolingSequenceGroupOutput):
            return NotImplemented
        # NOTE(review): the result is whatever `data.__eq__` yields; if data
        # is a tensor this is presumably elementwise, not a bool — confirm.
        return self.data == other.data

data instance-attribute

data: Any

__eq__

__eq__(other: object) -> bool
Source code in vllm/sequence.py
def __eq__(self, other: object) -> bool:
    """Compare ``data`` with another ``PoolingSequenceGroupOutput``.

    Returns ``NotImplemented`` for any other type so that Python can fall
    back to its default handling (e.g. ``obj == None`` is False) instead
    of raising from inside ``==``.
    """
    if not isinstance(other, PoolingSequenceGroupOutput):
        return NotImplemented
    return self.data == other.data

__repr__

__repr__() -> str
Source code in vllm/sequence.py
def __repr__(self) -> str:
    """Render the instance as ``PoolingSequenceGroupOutput(data=...)``."""
    # The closing parenthesis was previously missing from this string.
    return f"PoolingSequenceGroupOutput(data={self.data})"

get_data_nbytes

get_data_nbytes() -> int
Source code in vllm/sequence.py
def get_data_nbytes(self) -> int:
    data: torch.Tensor = self.data
    return data.nbytes

RequestMetrics dataclass

Metrics associated with a request.

Attributes:

Name Type Description
arrival_time float

The time when the request arrived.

first_scheduled_time Optional[float]

The time when the request was first scheduled.

first_token_time Optional[float]

The time when the first token was generated.

time_in_queue Optional[float]

The time the request spent in the queue.

finished_time Optional[float]

The time when the request was finished.

scheduler_time Optional[float]

The time spent in the scheduler when this request was being considered by the scheduler.

model_forward_time Optional[float]

The time spent in the model forward pass when this request was in the batch.

model_execute_time Optional[float]

The time spent in the model execute function. This will include model forward, block/sync across workers, cpu-gpu sync time and sampling time.

Source code in vllm/sequence.py
@dataclass
class RequestMetrics:
    """Metrics associated with a request.

    Attributes:
        arrival_time: The time when the request arrived.
        last_token_time: NOTE(review): previously undocumented; presumably
                         the time when the most recent token was
                         generated - confirm against the code that sets it.
        first_scheduled_time: The time when the request was first scheduled.
        first_token_time: The time when the first token was generated.
        time_in_queue: The time the request spent in the queue.
        finished_time: The time when the request was finished.
        scheduler_time: The time spent in the scheduler when this request was
                        being considered by the scheduler.
        model_forward_time: The time spent in the model forward pass when this
                            request was in the batch.
        model_execute_time: The time spent in the model execute function. This
                            will include model forward, block/sync across
                            workers, cpu-gpu sync time and sampling time.
    """
    # Required fields (no default): must be supplied by the caller.
    arrival_time: float
    last_token_time: float
    first_scheduled_time: Optional[float]
    first_token_time: Optional[float]
    time_in_queue: Optional[float]
    # Optional fields: default to None when not supplied.
    finished_time: Optional[float] = None
    scheduler_time: Optional[float] = None
    model_forward_time: Optional[float] = None
    model_execute_time: Optional[float] = None

arrival_time instance-attribute

arrival_time: float

finished_time class-attribute instance-attribute

finished_time: Optional[float] = None

first_scheduled_time instance-attribute

first_scheduled_time: Optional[float]

first_token_time instance-attribute

first_token_time: Optional[float]

last_token_time instance-attribute

last_token_time: float

model_execute_time class-attribute instance-attribute

model_execute_time: Optional[float] = None

model_forward_time class-attribute instance-attribute

model_forward_time: Optional[float] = None

scheduler_time class-attribute instance-attribute

scheduler_time: Optional[float] = None

time_in_queue instance-attribute

time_in_queue: Optional[float]

__init__

__init__(
    arrival_time: float,
    last_token_time: float,
    first_scheduled_time: Optional[float],
    first_token_time: Optional[float],
    time_in_queue: Optional[float],
    finished_time: Optional[float] = None,
    scheduler_time: Optional[float] = None,
    model_forward_time: Optional[float] = None,
    model_execute_time: Optional[float] = None,
) -> None