Merged

31 commits
95f8c97  bart compile (zucchini-nlp, Dec 16, 2024)
16eef87  add mbart (zucchini-nlp, Dec 17, 2024)
173eb69  some more models touched by fix-copies (zucchini-nlp, Dec 17, 2024)
a650eb8  more (zucchini-nlp, Dec 17, 2024)
fb9c7bf  more models (zucchini-nlp, Dec 17, 2024)
f8dbbd3  even more models (zucchini-nlp, Dec 17, 2024)
b7b6b4f  fix copies (zucchini-nlp, Dec 17, 2024)
0c1bfbb  fix tests (zucchini-nlp, Dec 18, 2024)
d6ea64a  fix copies (zucchini-nlp, Dec 18, 2024)
1b93312  fix (zucchini-nlp, Dec 18, 2024)
cda66f0  biogpt accepts position ids now (breaking?) (zucchini-nlp, Dec 18, 2024)
a33e663  fix failing non-slow tests (zucchini-nlp, Jan 10, 2025)
ffb734b  fix some tests (zucchini-nlp, Jan 10, 2025)
a01b959  should not be removed (zucchini-nlp, Jan 10, 2025)
75c01e9  Merge remote-tracking branch 'upstream/main' into bart-compile (zucchini-nlp, Jan 13, 2025)
490d0c2  small update (zucchini-nlp, Jan 13, 2025)
989b187  Update src/transformers/models/bart/modeling_bart.py (zucchini-nlp, Jan 14, 2025)
0ed7456  update for last `main` (zucchini-nlp, Jan 15, 2025)
29f1d71  Merge remote-tracking branch 'upstream/main' into bart-compile (zucchini-nlp, Jan 15, 2025)
3819ff5  fix copies (zucchini-nlp, Jan 15, 2025)
735b8af  clone `update_causal_mask` from llama (zucchini-nlp, Jan 17, 2025; see the sketch after this list)
6324efc  tmp (zucchini-nlp, Jan 20, 2025)
7667cc2  Merge remote-tracking branch 'upstream/main' into bart-compile (zucchini-nlp, May 15, 2025)
a5f1763  fixup (zucchini-nlp, May 15, 2025)
2a93037  why? how? (zucchini-nlp, May 15, 2025)
d3f72cf  fix bart tests (zucchini-nlp, May 15, 2025)
7c5e22a  dont skip test (zucchini-nlp, May 15, 2025)
7c7c0a8  address comments (zucchini-nlp, May 15, 2025)
601fed1  fix tests (zucchini-nlp, May 15, 2025)
c9fbaca  fix (zucchini-nlp, May 15, 2025)
602e16f  fixup and delete the file (zucchini-nlp, May 16, 2025)
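One of the larger pieces of the refactor is commit 735b8af, which clones the `update_causal_mask` helper from Llama so the BART-family decoders can build a static-shaped 4D causal mask, a prerequisite for `StaticCache` and `torch.compile`. The sketch below is a hypothetical, simplified version of that Llama-style mask construction, not the exact code added in this PR; the function name, shapes, and arguments are assumptions.

import torch


def build_causal_mask_sketch(
    attention_mask: torch.Tensor,  # (batch, seq_len) with 1 = keep, 0 = padding
    sequence_length: int,          # number of query tokens in this forward pass
    target_length: int,            # total key/value length (past + current), static under compile
    dtype: torch.dtype,
    cache_position: torch.Tensor,  # (sequence_length,) absolute positions of the query tokens
) -> torch.Tensor:
    """Hypothetical Llama-style 4D additive causal mask (illustration only)."""
    min_value = torch.finfo(dtype).min
    device = attention_mask.device

    # Start with everything masked, then zero out key positions that are <= each query position.
    causal_mask = torch.full((sequence_length, target_length), min_value, dtype=dtype, device=device)
    causal_mask = causal_mask * (torch.arange(target_length, device=device) > cache_position.reshape(-1, 1))

    # Broadcast to (batch, 1, q_len, kv_len) and fold in the padding mask.
    causal_mask = causal_mask[None, None, :, :].expand(attention_mask.shape[0], 1, -1, -1).clone()
    mask_length = attention_mask.shape[-1]
    padding_mask = causal_mask[..., :mask_length] + attention_mask[:, None, None, :].to(dtype)
    causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask == 0, min_value)
    return causal_mask


# Example: 2 sequences of length 4, the second with a padded first token.
mask = build_causal_mask_sketch(
    attention_mask=torch.tensor([[1, 1, 1, 1], [0, 1, 1, 1]]),
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    cache_position=torch.arange(4),
)
print(mask.shape)  # torch.Size([2, 1, 4, 4])

Because the mask is built with a fixed `target_length`, its shape does not change as generation proceeds, which is what keeps compiled graphs from recompiling at every step.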
8 changes: 4 additions & 4 deletions src/transformers/generation/candidate_generator.py
@@ -28,7 +28,7 @@
 if is_sklearn_available():
     from sklearn.metrics import roc_curve
 
-from ..cache_utils import DynamicCache
+from ..cache_utils import Cache
 from ..pytorch_utils import isin_mps_friendly
 from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor, SuppressTokensLogitsProcessor
 
@@ -1183,7 +1183,9 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
 def _crop_past_key_values(model, past_key_values, max_length):
     """Crops the past key values up to a certain maximum length."""
     new_past = []
-    if model.config.is_encoder_decoder:
+    if isinstance(past_key_values, Cache):
+        past_key_values.crop(max_length)
+    elif model.config.is_encoder_decoder:
         for idx in range(len(past_key_values)):
             new_past.append(
                 (
@@ -1204,8 +1206,6 @@ def _crop_past_key_values(model, past_key_values, max_length):
         else:
             for idx in range(len(past_key_values)):
                 past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
-    elif isinstance(past_key_values, DynamicCache):
-        past_key_values.crop(max_length)
     elif past_key_values is not None:
         for idx in range(len(past_key_values)):
             if past_key_values[idx] != ([], []):
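With this reordering, assisted generation crops any `Cache` subclass through its `crop` method first, and only falls back to the encoder-decoder and legacy-tuple handling for non-`Cache` inputs. Below is a rough usage sketch of the `Cache` branch, assuming an in-place `crop` as `DynamicCache` provides; the tiny checkpoint name is only an illustrative choice and is not part of the PR.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import DynamicCache
from transformers.generation.candidate_generator import _crop_past_key_values

# Illustrative setup; any causal LM checkpoint would do.
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

inputs = tokenizer("draft tokens to be cropped", return_tensors="pt")
outputs = model(**inputs, past_key_values=DynamicCache(), use_cache=True)

cache = outputs.past_key_values            # a Cache instance, so the first branch applies
print(cache.get_seq_length())              # sequence length before cropping
_crop_past_key_values(model, cache, max_length=2)
print(cache.get_seq_length())              # 2, cropped in place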
15 changes: 9 additions & 6 deletions src/transformers/models/autoformer/modeling_autoformer.py
@@ -370,13 +370,16 @@ def _init_weight(self):
         self.weight = nn.Parameter(out, requires_grad=False)
 
     @torch.no_grad()
-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
+    def forward(
+        self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
         """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
-        positions = torch.arange(
-            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
-        return super().forward(positions)
+        if position_ids is None:
+            bsz, seq_len = input_ids_shape[:2]
+            position_ids = torch.arange(
+                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+            )
+        return super().forward(position_ids)
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding with TimeSeries->Autoformer
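Several models in this PR get the same signature change: the sinusoidal positional embedding now accepts explicit `position_ids` and only falls back to the old `arange`-from-shape behaviour when they are not provided, which matters when positions are tracked externally (e.g. via cache positions). A minimal sketch of both call paths, assuming the class is named `AutoformerSinusoidalPositionalEmbedding` as in earlier versions of this file:

import torch
from transformers.models.autoformer.modeling_autoformer import (
    AutoformerSinusoidalPositionalEmbedding,
)

emb = AutoformerSinusoidalPositionalEmbedding(num_positions=64, embedding_dim=32)

# Old behaviour: positions are derived from the input shape and the past length.
from_shape = emb(torch.Size([2, 5]), past_key_values_length=3)   # positions 3..7, shape (5, 32)

# New behaviour: the caller passes the positions explicitly.
position_ids = torch.arange(3, 8).unsqueeze(0)                   # shape (1, 5)
from_ids = emb(torch.Size([2, 5]), position_ids=position_ids)    # shape (1, 5, 32)

print(from_shape.shape, from_ids.shape)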
603 changes: 384 additions & 219 deletions src/transformers/models/bart/modeling_bart.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion src/transformers/models/big_bird/modeling_big_bird.py
@@ -1340,7 +1340,6 @@ def set_attention_type(self, value: str):
         attn_weights.value = self.self.value
         attn_weights.key = self.self.key
         self.self = attn_weights
-        self.attention_type = value
         if not self.training:
             self.self.eval()
 
399 changes: 293 additions & 106 deletions src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py

Large diffs are not rendered by default.

479 changes: 320 additions & 159 deletions src/transformers/models/biogpt/modeling_biogpt.py

Large diffs are not rendered by default.
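Per commit cda66f0, BioGPT's forward now accepts `position_ids`, flagged as potentially breaking because positions were previously always derived internally. A hedged usage sketch, assuming the new argument is exposed on `BioGptModel.forward` the same way as in other decoder-only models (the exact signature is not shown in this rendered diff):

import torch
from transformers import AutoTokenizer, BioGptModel

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptModel.from_pretrained("microsoft/biogpt")

inputs = tokenizer("BioGPT is a domain-specific language model.", return_tensors="pt")
seq_len = inputs["input_ids"].shape[1]

# Explicit position ids; previously BioGPT computed these internally from the attention mask.
position_ids = torch.arange(seq_len).unsqueeze(0)
outputs = model(**inputs, position_ids=position_ids)
print(outputs.last_hidden_state.shape)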

386 changes: 287 additions & 99 deletions src/transformers/models/blenderbot/modeling_blenderbot.py

Large diffs are not rendered by default.

393 changes: 291 additions & 102 deletions src/transformers/models/blenderbot_small/modeling_blenderbot_small.py

Large diffs are not rendered by default.

25 changes: 16 additions & 9 deletions src/transformers/models/informer/modeling_informer.py
@@ -21,7 +21,10 @@
 from torch import nn
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_causal_attention_mask,
+)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -251,13 +254,16 @@ def _init_weight(self):
         self.weight = nn.Parameter(out, requires_grad=False)
 
     @torch.no_grad()
-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
+    def forward(
+        self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
         """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
-        positions = torch.arange(
-            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
-        return super().forward(positions)
+        if position_ids is None:
+            bsz, seq_len = input_ids_shape[:2]
+            position_ids = torch.arange(
+                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+            )
+        return super().forward(position_ids)
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding with TimeSeries->Info
@@ -270,7 +276,7 @@ def forward(self, x):
         return self.value_projection(x)
 
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
+# Copied from transformers.models.hubert.modeling_hubert.HubertAttention with Hubert->Informer
 class InformerAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -1045,7 +1051,6 @@ def forward(
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,time-series-transformer->informer,Transformer->Informer,TimeSeries->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a
@@ -1403,6 +1408,7 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
+    # Ignore copy
     @auto_docstring
     def forward(
         self,
@@ -1654,6 +1660,7 @@ def output_distribution(self, params, loc=None, scale=None, trailing_n=None) ->
         sliced_params = [p[:, -trailing_n:] for p in params]
         return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
 
+    # Ignore copy
     @auto_docstring
     def forward(
         self,