@@ -679,9 +679,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
679
679
}
680
680
681
681
// Heap-allocates a new clip_image_u8 via `new clip_image_u8()` and hands the
// raw pointer to the caller, who is responsible for releasing it.
clip_image_u8 * make_clip_image_u8() {
    clip_image_u8 * fresh_image = new clip_image_u8();
    return fresh_image;
}
682
-
683
682
// Heap-allocates a new clip_image_f32 via `new clip_image_f32()` and hands the
// raw pointer to the caller, who is responsible for releasing it.
clip_image_f32 * make_clip_image_f32() {
    clip_image_f32 * fresh_image = new clip_image_f32();
    return fresh_image;
}
684
683
684
+ void clip_image_u8_free (clip_image_u8 * img) { if (img->data ) { delete[] img->data ; } delete img; }
685
+ void clip_image_f32_free (clip_image_f32 * img) { if (img->data ) { delete[] img->data ; } delete img; }
686
+
685
687
static void build_clip_img_from_data (const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
686
688
img->nx = nx;
687
689
img->ny = ny;
@@ -726,39 +728,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
726
728
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
727
729
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
728
730
729
- clip_image_u8 temp; // we will keep the input image data here temporarily
731
+ clip_image_u8 * temp = make_clip_image_u8 () ; // we will keep the input image data here temporarily
730
732
if (pad2square && img->nx != img->ny ) {
731
733
int longer_side = std::max (img->nx , img->ny );
732
- temp. nx = longer_side;
733
- temp. ny = longer_side;
734
- temp. size = 3 * longer_side * longer_side;
735
- temp. data = new uint8_t [temp. size ]();
734
+ temp-> nx = longer_side;
735
+ temp-> ny = longer_side;
736
+ temp-> size = 3 * longer_side * longer_side;
737
+ temp-> data = new uint8_t [temp-> size ]();
736
738
uint8_t bc[3 ] = {122 , 116 , 104 }; // background color in RGB from LLaVA
737
739
738
740
// fill with background color
739
- for (size_t i = 0 ; i < temp. size ; i++) {
740
- temp. data [i] = bc[i % 3 ];
741
+ for (size_t i = 0 ; i < temp-> size ; i++) {
742
+ temp-> data [i] = bc[i % 3 ];
741
743
}
742
744
743
745
// copy from the input image
744
746
for (int y = 0 ; y < img->ny ; y++) {
745
747
for (int x = 0 ; x < img->nx ; x++) {
746
748
const int i = 3 * (y * img->nx + x);
747
- const int j = 3 * (y * temp. nx + x);
748
- temp. data [j] = img->data [i];
749
- temp. data [j+1 ] = img->data [i+1 ];
750
- temp. data [j+2 ] = img->data [i+2 ];
749
+ const int j = 3 * (y * temp-> nx + x);
750
+ temp-> data [j] = img->data [i];
751
+ temp-> data [j+1 ] = img->data [i+1 ];
752
+ temp-> data [j+2 ] = img->data [i+2 ];
751
753
}
752
754
}
753
755
} else {
754
- temp.nx = img->nx ;
755
- temp.ny = img->ny ;
756
- temp.size = img->size ;
757
- temp.data = img->data ;
756
+ temp->nx = img->nx ;
757
+ temp->ny = img->ny ;
758
+ temp->size = img->size ;
759
+ temp->data = new uint8_t [temp->size ]();
760
+ *temp->data = *img->data ; // copy
758
761
}
759
762
760
- const int nx = temp. nx ;
761
- const int ny = temp. ny ;
763
+ const int nx = temp-> nx ;
764
+ const int ny = temp-> ny ;
762
765
763
766
const int nx2 = ctx->vision_model .hparams .image_size ;
764
767
const int ny2 = ctx->vision_model .hparams .image_size ;
@@ -797,10 +800,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
797
800
const int j10 = 3 * (y1 * nx + x0) + c;
798
801
const int j11 = 3 * (y1 * nx + x1) + c;
799
802
800
- const float v00 = temp. data [j00];
801
- const float v01 = temp. data [j01];
802
- const float v10 = temp. data [j10];
803
- const float v11 = temp. data [j11];
803
+ const float v00 = temp-> data [j00];
804
+ const float v01 = temp-> data [j01];
805
+ const float v10 = temp-> data [j10];
806
+ const float v11 = temp-> data [j11];
804
807
805
808
const float v0 = v00 * (1 .0f - dx) + v01 * dx;
806
809
const float v1 = v10 * (1 .0f - dx) + v11 * dx;
@@ -815,6 +818,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
815
818
}
816
819
}
817
820
}
821
+ clip_image_u8_free (temp);
818
822
819
823
return true ;
820
824
}
0 commit comments