@@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {
 
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
 
@@ -1497,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
         clip_image_f32_batch_free(res_imgs);
     }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
         patches.insert(patches.begin(), image_original_resize);
         // clip_image_f32_batch_init(patches.size());
-        res_imgs.size = patches.size();
-        res_imgs.data = new clip_image_f32[res_imgs.size];
+        res_imgs->size = patches.size();
+        res_imgs->data = new clip_image_f32[res_imgs->size];
         int num=0;
         for (auto & patch : patches) {
-            normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+            normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
             num++;
         }
 
@@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     // }
     // res_imgs.push_back(res);
 
-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
     clip_image_f32_free(res);
 
     return true;
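
Since `clip_image_preprocess` and the two batch-free helpers now take a `clip_image_f32_batch *` instead of a reference, call sites have to pass the address of a zero-initialized batch and release it through the same pointer. Below is a minimal caller-side sketch of that pattern; the helper name and surrounding logic are hypothetical and only the `size`/`data` fields and the signatures shown in the diff above are assumed.

// Hypothetical helper, not part of this patch: illustrates the pointer-based calling convention.
static bool encode_image_with_clip_sketch(struct clip_ctx * ctx_clip, const clip_image_u8 * img) {
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;       // start empty so clip_image_preprocess() has nothing stale to free
    img_res_v.data = nullptr;

    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {  // pass &batch instead of a reference
        printf("failed to preprocess image\n");
        return false;
    }

    // ... run the vision encoder over img_res_v.data[0 .. img_res_v.size - 1] ...

    clip_image_f32_batch_free(&img_res_v);                    // release through the same pointer
    return true;
}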