From aac52940ade5c788bc7d8d6949da718b63293dc1 Mon Sep 17 00:00:00 2001 From: lingbai-kong Date: Fri, 23 Jun 2023 13:17:46 +0800 Subject: [PATCH 1/5] init pickle support to np.load object type of npy --- .../NumPy/DtypeConstructor.cs | 40 ++++++++++++ .../Implementation/NumPyImpl.Creation.cs | 18 +++++- .../NumPy/Implementation/NumPyImpl.load.cs | 22 +++++-- .../NumPy/MultiArrayConstructor.cs | 44 +++++++++++++ .../NumPy/NDArray.Pickle.cs | 19 ++++++ .../Tensorflow.Binding.csproj | 1 + src/TensorFlowNET.Keras/Datasets/Imdb.cs | 63 +++++++++++++++++-- .../Dataset/DatasetTest.cs | 17 +++++ 8 files changed, 215 insertions(+), 9 deletions(-) create mode 100644 src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs create mode 100644 src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs create mode 100644 src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs diff --git a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs new file mode 100644 index 000000000..f84f408e1 --- /dev/null +++ b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs @@ -0,0 +1,40 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Text; +using Razorvine.Pickle; + +namespace Tensorflow.NumPy +{ + /// + /// + /// + [SuppressMessage("ReSharper", "InconsistentNaming")] + [SuppressMessage("ReSharper", "MemberCanBePrivate.Global")] + [SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")] + class DtypeConstructor : IObjectConstructor + { + public object construct(object[] args) + { + Console.WriteLine("DtypeConstructor"); + Console.WriteLine(args.Length); + for (int i = 0; i < args.Length; i++) + { + Console.WriteLine(args[i]); + } + return new demo(); + } + } + class demo + { + public void __setstate__(object[] args) + { + Console.WriteLine("demo __setstate__"); + Console.WriteLine(args.Length); + for (int i = 0; i < args.Length; i++) + { + Console.WriteLine(args[i]); + } + } + } +} diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs index f29879b0f..80b62198a 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs @@ -4,6 +4,7 @@ using System.Linq; using System.Text; using Tensorflow.Util; +using Razorvine.Pickle; using static Tensorflow.Binding; namespace Tensorflow.NumPy @@ -93,10 +94,25 @@ Array ReadValueMatrix(BinaryReader reader, Array matrix, int bytes, Type type, i var buffer = reader.ReadBytes(bytes * total); System.Buffer.BlockCopy(buffer, 0, matrix, 0, buffer.Length); - return matrix; } + NDArray ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape) + { + //int data = reader.ReadByte(); + //Console.WriteLine(data); + //Console.WriteLine(reader.ReadByte()); + Stream stream = reader.BaseStream; + Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor()); + Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor()); + + var unpickler = new Unpickler(); + + NDArray result = (NDArray) unpickler.load(stream); + Console.WriteLine(result.dims); + return result; + } + public (NDArray, NDArray) meshgrid(T[] array, bool copy = true, bool sparse = false) { var tensors = array_ops.meshgrid(array, copy: copy, sparse: sparse); diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs index 05f53d5e7..789f119a1 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs @@ -27,9 +27,20 @@ public Array LoadMatrix(Stream stream) Array matrix = Array.CreateInstance(type, shape); //if (type == typeof(String)) - //return ReadStringMatrix(reader, matrix, bytes, type, shape); + //return ReadStringMatrix(reader, matrix, bytes, type, shape); + NDArray res = ReadObjectMatrix(reader, matrix, shape); + Console.WriteLine("LoadMatrix"); + Console.WriteLine(res.dims[0]); + Console.WriteLine((int)res[0][0]); + Console.WriteLine(res.dims[1]); + //if (type == typeof(Object)) + //{ + + //} + //else return ReadValueMatrix(reader, matrix, bytes, type, shape); } + } public T Load(Stream stream) @@ -37,7 +48,7 @@ public T Load(Stream stream) ICloneable, IList, ICollection, IEnumerable, IStructuralComparable, IStructuralEquatable { // if (typeof(T).IsArray && (typeof(T).GetElementType().IsArray || typeof(T).GetElementType() == typeof(string))) - // return LoadJagged(stream) as T; + // return LoadJagged(stream) as T; return LoadMatrix(stream) as T; } @@ -48,7 +59,7 @@ bool ParseReader(BinaryReader reader, out int bytes, out Type t, out int[] shape shape = null; // The first 6 bytes are a magic string: exactly "x93NUMPY" - if (reader.ReadChar() != 63) return false; + if (reader.ReadByte() != 0x93) return false; if (reader.ReadChar() != 'N') return false; if (reader.ReadChar() != 'U') return false; if (reader.ReadChar() != 'M') return false; @@ -64,6 +75,7 @@ bool ParseReader(BinaryReader reader, out int bytes, out Type t, out int[] shape ushort len = reader.ReadUInt16(); string header = new String(reader.ReadChars(len)); + Console.WriteLine(header); string mark = "'descr': '"; int s = header.IndexOf(mark) + mark.Length; int e = header.IndexOf("'", s + 1); @@ -93,7 +105,7 @@ bool ParseReader(BinaryReader reader, out int bytes, out Type t, out int[] shape Type GetType(string dtype, out int bytes, out bool? isLittleEndian) { isLittleEndian = IsLittleEndian(dtype); - bytes = Int32.Parse(dtype.Substring(2)); + bytes = dtype.Length > 2 ? Int32.Parse(dtype.Substring(2)) : 0; string typeCode = dtype.Substring(1); @@ -121,6 +133,8 @@ Type GetType(string dtype, out int bytes, out bool? isLittleEndian) return typeof(Double); if (typeCode.StartsWith("S")) return typeof(String); + if (typeCode == "O") + return typeof(Object); throw new NotSupportedException(); } diff --git a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs new file mode 100644 index 000000000..92927cd5a --- /dev/null +++ b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs @@ -0,0 +1,44 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Text; +using Razorvine.Pickle; + +namespace Tensorflow.NumPy +{ + /// + /// Creates multiarrays of objects. Returns a primitive type multiarray such as int[][] if + /// the objects are ints, etc. + /// + [SuppressMessage("ReSharper", "InconsistentNaming")] + [SuppressMessage("ReSharper", "MemberCanBePrivate.Global")] + [SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")] + public class MultiArrayConstructor : IObjectConstructor + { + public object construct(object[] args) + { + //Console.WriteLine(args.Length); + //for (int i = 0; i < args.Length; i++) + //{ + // Console.WriteLine(args[i]); + //} + Console.WriteLine("MultiArrayConstructor"); + + var arg1 = (Object[])args[1]; + var dims = new int[arg1.Length]; + for (var i = 0; i < arg1.Length; i++) + { + dims[i] = (int)arg1[i]; + } + + var dtype = TF_DataType.DtInvalid; + switch (args[2]) + { + case "b": dtype = TF_DataType.DtUint8Ref; break; + default: throw new NotImplementedException("cannot parse" + args[2]); + } + return new NDArray(new Shape(dims), dtype); + + } + } +} diff --git a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs new file mode 100644 index 000000000..b4d66243a --- /dev/null +++ b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Tensorflow.NumPy +{ + public partial class NDArray + { + public void __setstate__(object[] args) + { + Console.WriteLine("NDArray __setstate__"); + Console.WriteLine(args.Length); + for (int i = 0; i < args.Length; i++) + { + Console.WriteLine(args[i]); + } + } + } +} diff --git a/src/TensorFlowNET.Core/Tensorflow.Binding.csproj b/src/TensorFlowNET.Core/Tensorflow.Binding.csproj index 09f5b0770..38778c3fe 100644 --- a/src/TensorFlowNET.Core/Tensorflow.Binding.csproj +++ b/src/TensorFlowNET.Core/Tensorflow.Binding.csproj @@ -112,6 +112,7 @@ https://tensorflownet.readthedocs.io + diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 56b0d2a77..016b352d9 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -5,6 +5,13 @@ using Tensorflow.Keras.Utils; using Tensorflow.NumPy; using System.Linq; +using Google.Protobuf.Collections; +using Microsoft.VisualBasic; +using OneOf.Types; +using static HDF.PInvoke.H5; +using System.Data; +using System.Reflection.Emit; +using System.Xml.Linq; namespace Tensorflow.Keras.Datasets { @@ -12,13 +19,59 @@ namespace Tensorflow.Keras.Datasets /// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment /// (positive/negative). Reviews have been preprocessed, and each review is /// encoded as a list of word indexes(integers). + /// For convenience, words are indexed by overall frequency in the dataset, + /// so that for instance the integer "3" encodes the 3rd most frequent word in + /// the data.This allows for quick filtering operations such as: + /// "only consider the top 10,000 most + /// common words, but eliminate the top 20 most common words". + /// As a convention, "0" does not stand for a specific word, but instead is used + /// to encode the pad token. + /// Args: + /// path: where to cache the data (relative to %TEMP%/imdb/imdb.npz). + /// num_words: integer or None.Words are + /// ranked by how often they occur(in the training set) and only + /// the `num_words` most frequent words are kept.Any less frequent word + /// will appear as `oov_char` value in the sequence data.If None, + /// all words are kept.Defaults to `None`. + /// skip_top: skip the top N most frequently occurring words + /// (which may not be informative). These words will appear as + /// `oov_char` value in the dataset.When 0, no words are + /// skipped. Defaults to `0`. + /// maxlen: int or None.Maximum sequence length. + /// Any longer sequence will be truncated. None, means no truncation. + /// Defaults to `None`. + /// seed: int. Seed for reproducible data shuffling. + /// start_char: int. The start of a sequence will be marked with this + /// character. 0 is usually the padding character. Defaults to `1`. + /// oov_char: int. The out-of-vocabulary character. + /// Words that were cut out because of the `num_words` or + /// `skip_top` limits will be replaced with this character. + /// index_from: int. Index actual words with this index and higher. + /// Returns: + /// Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. + /// + /// ** x_train, x_test**: lists of sequences, which are lists of indexes + /// (integers). If the num_words argument was specific, the maximum + /// possible index value is `num_words - 1`. If the `maxlen` argument was + /// specified, the largest possible sequence length is `maxlen`. + /// + /// ** y_train, y_test**: lists of integer labels(1 or 0). + /// + /// Raises: + /// ValueError: in case `maxlen` is so low + /// that no input sequence could be kept. + /// Note that the 'out of vocabulary' character is only used for + /// words that were present in the training set but are not included + /// because they're not making the `num_words` cut here. + /// Words that were not seen in the training set but are in the test set + /// have simply been skipped. /// + /// """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). public class Imdb { string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; string file_name = "imdb.npz"; string dest_folder = "imdb"; - /// /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). /// @@ -41,8 +94,10 @@ public DatasetPass load_data(string path = "imdb.npz", int index_from = 3) { var dst = Download(); - - var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt")); + var fileBytes = File.ReadAllBytes(Path.Combine(dst, file_name)); + var (x_train, x_test) = LoadX(fileBytes); + var (y_train, y_test) = LoadY(fileBytes); + /*var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt")); var x_train_string = new string[lines.Length]; var y_train = np.zeros(new int[] { lines.Length }, np.int64); for (int i = 0; i < lines.Length; i++) @@ -62,7 +117,7 @@ public DatasetPass load_data(string path = "imdb.npz", x_test_string[i] = lines[i].Substring(2); } - var x_test = np.array(x_test_string); + var x_test = np.array(x_test_string);*/ return new DatasetPass { diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs index 8317346ea..778290bb8 100644 --- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs +++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs @@ -1,7 +1,9 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using System; +using System.Collections.Generic; using System.Linq; using static Tensorflow.Binding; +using static Tensorflow.KerasApi; namespace TensorFlowNET.UnitTest.Dataset { @@ -195,5 +197,20 @@ public void Shuffle() Assert.IsFalse(allEqual); } + [TestMethod] + public void GetData() + { + var vocab_size = 20000; // Only consider the top 20k words + var maxlen = 200; // Only consider the first 200 words of each movie review + var dataset = keras.datasets.imdb.load_data(num_words: vocab_size); + var x_train = dataset.Train.Item1; + var y_train = dataset.Train.Item2; + var x_val = dataset.Test.Item1; + var y_val = dataset.Test.Item2; + print(len(x_train) + "Training sequences"); + print(len(x_val) + "Validation sequences"); + x_train = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_train, maxlen: maxlen); + x_val = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_val, maxlen: maxlen); + } } } From 9d10daf30f02ebf078d56aadca59cc269ae23b4d Mon Sep 17 00:00:00 2001 From: lingbai-kong Date: Wed, 6 Sep 2023 23:12:00 +0800 Subject: [PATCH 2/5] add reconstruction and setstate of NDArray for loading pickled npy file. --- .../NumPy/DtypeConstructor.cs | 55 ++++++++--- .../Implementation/NumPyImpl.Creation.cs | 3 - .../NumPy/Implementation/NumPyImpl.load.cs | 24 ++--- .../NumPy/MultiArrayConstructor.cs | 35 ++++--- .../NumPy/NDArray.Pickle.cs | 99 ++++++++++++++++++- .../NumPy/NDArrayConverter.cs | 1 + src/TensorFlowNET.Core/Numpy/Numpy.cs | 4 +- src/TensorFlowNET.Keras/Datasets/Imdb.cs | 10 +- 8 files changed, 178 insertions(+), 53 deletions(-) diff --git a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs index f84f408e1..30ef82df4 100644 --- a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs +++ b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs @@ -16,25 +16,50 @@ class DtypeConstructor : IObjectConstructor { public object construct(object[] args) { - Console.WriteLine("DtypeConstructor"); - Console.WriteLine(args.Length); - for (int i = 0; i < args.Length; i++) - { - Console.WriteLine(args[i]); - } - return new demo(); + var typeCode = (string)args[0]; + TF_DataType dtype; + if (typeCode == "b1") + dtype = np.@bool; + else if (typeCode == "i1") + dtype = np.@byte; + else if (typeCode == "i2") + dtype = np.int16; + else if (typeCode == "i4") + dtype = np.int32; + else if (typeCode == "i8") + dtype = np.int64; + else if (typeCode == "u1") + dtype = np.ubyte; + else if (typeCode == "u2") + dtype = np.uint16; + else if (typeCode == "u4") + dtype = np.uint32; + else if (typeCode == "u8") + dtype = np.uint64; + else if (typeCode == "f4") + dtype = np.float32; + else if (typeCode == "f8") + dtype = np.float64; + else if (typeCode.StartsWith("S")) + dtype = np.@string; + else if (typeCode.StartsWith("O")) + dtype = np.@object; + else + throw new NotSupportedException(); + return new TF_DataType_Warpper(dtype); } } - class demo + public class TF_DataType_Warpper { - public void __setstate__(object[] args) + TF_DataType dtype { get; set; } + public TF_DataType_Warpper(TF_DataType dtype) { - Console.WriteLine("demo __setstate__"); - Console.WriteLine(args.Length); - for (int i = 0; i < args.Length; i++) - { - Console.WriteLine(args[i]); - } + this.dtype = dtype; + } + public void __setstate__(object[] args) { } + public static implicit operator TF_DataType(TF_DataType_Warpper dtypeWarpper) + { + return dtypeWarpper.dtype; } } } diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs index 80b62198a..7b79f83c6 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs @@ -99,9 +99,6 @@ Array ReadValueMatrix(BinaryReader reader, Array matrix, int bytes, Type type, i NDArray ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape) { - //int data = reader.ReadByte(); - //Console.WriteLine(data); - //Console.WriteLine(reader.ReadByte()); Stream stream = reader.BaseStream; Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor()); Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor()); diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs index 789f119a1..bbe48e6a4 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs @@ -28,17 +28,17 @@ public Array LoadMatrix(Stream stream) //if (type == typeof(String)) //return ReadStringMatrix(reader, matrix, bytes, type, shape); - NDArray res = ReadObjectMatrix(reader, matrix, shape); - Console.WriteLine("LoadMatrix"); - Console.WriteLine(res.dims[0]); - Console.WriteLine((int)res[0][0]); - Console.WriteLine(res.dims[1]); - //if (type == typeof(Object)) - //{ - - //} - //else - return ReadValueMatrix(reader, matrix, bytes, type, shape); + + if (type == typeof(Object)) + { + NDArray res = ReadObjectMatrix(reader, matrix, shape); + // res = res.reconstructedNDArray; + return res.reconstructedArray; + } + else + { + return ReadValueMatrix(reader, matrix, bytes, type, shape); + } } } @@ -133,7 +133,7 @@ Type GetType(string dtype, out int bytes, out bool? isLittleEndian) return typeof(Double); if (typeCode.StartsWith("S")) return typeof(String); - if (typeCode == "O") + if (typeCode.StartsWith("O")) return typeof(Object); throw new NotSupportedException(); diff --git a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs index 92927cd5a..43eda23e0 100644 --- a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs +++ b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs @@ -3,6 +3,7 @@ using System.Diagnostics.CodeAnalysis; using System.Text; using Razorvine.Pickle; +using Razorvine.Pickle.Objects; namespace Tensorflow.NumPy { @@ -17,28 +18,36 @@ public class MultiArrayConstructor : IObjectConstructor { public object construct(object[] args) { - //Console.WriteLine(args.Length); - //for (int i = 0; i < args.Length; i++) - //{ - // Console.WriteLine(args[i]); - //} - Console.WriteLine("MultiArrayConstructor"); - + if (args.Length != 3) + throw new InvalidArgumentError($"Invalid number of arguments in MultiArrayConstructor._reconstruct. Expected three arguments. Given {args.Length} arguments."); + + var types = (ClassDictConstructor)args[0]; + if (types.module != "numpy" || types.name != "ndarray") + throw new RuntimeError("_reconstruct: First argument must be a sub-type of ndarray"); + var arg1 = (Object[])args[1]; var dims = new int[arg1.Length]; for (var i = 0; i < arg1.Length; i++) { dims[i] = (int)arg1[i]; } + var shape = new Shape(dims); - var dtype = TF_DataType.DtInvalid; - switch (args[2]) + TF_DataType dtype; + string identifier; + if (args[2].GetType() == typeof(string)) + identifier = (string)args[2]; + else + identifier = Encoding.UTF8.GetString((byte[])args[2]); + switch (identifier) { - case "b": dtype = TF_DataType.DtUint8Ref; break; - default: throw new NotImplementedException("cannot parse" + args[2]); + case "u": dtype = np.uint32; break; + case "c": dtype = np.complex_; break; + case "f": dtype = np.float32; break; + case "b": dtype = np.@bool; break; + default: throw new NotImplementedException($"Unsupported data type: {args[2]}"); } - return new NDArray(new Shape(dims), dtype); - + return new NDArray(shape, dtype); } } } diff --git a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs index b4d66243a..62720826a 100644 --- a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs +++ b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs @@ -1,4 +1,7 @@ -using System; +using Newtonsoft.Json.Linq; +using Serilog.Debugging; +using System; +using System.Collections; using System.Collections.Generic; using System.Text; @@ -6,14 +9,100 @@ namespace Tensorflow.NumPy { public partial class NDArray { + public NDArray reconstructedNDArray { get; set; } + public Array reconstructedArray { get; set; } public void __setstate__(object[] args) { - Console.WriteLine("NDArray __setstate__"); - Console.WriteLine(args.Length); - for (int i = 0; i < args.Length; i++) + if (args.Length != 5) + throw new InvalidArgumentError($"Invalid number of arguments in NDArray.__setstate__. Expected five arguments. Given {args.Length} arguments."); + + var version = (int)args[0]; // version + + var arg1 = (Object[])args[1]; + var dims = new int[arg1.Length]; + for (var i = 0; i < arg1.Length; i++) + { + dims[i] = (int)arg1[i]; + } + var _ShapeLike = new Shape(dims); // shape + + TF_DataType _DType_co = (TF_DataType_Warpper)args[2]; // DType + + var F_continuous = (bool)args[3]; // F-continuous + if (F_continuous) + throw new InvalidArgumentError("Fortran Continuous memory layout is not supported. Please use C-continuous layout or check the data format."); + + var data = args[4]; // Data + /* + * If we ever need another pickle format, increment the version + * number. But we should still be able to handle the old versions. + */ + if (version < 0 || version > 4) + throw new ValueError($"can't handle version {version} of numpy.dtype pickle"); + + // TODO: Implement the missing details and checks from the official Numpy C code here. + // https://github.com/numpy/numpy/blob/2f0bd6e86a77e4401d0384d9a75edf9470c5deb6/numpy/core/src/multiarray/descriptor.c#L2761 + + if (data.GetType() == typeof(ArrayList)) + { + SetState((ArrayList)data); + } + else + throw new NotImplementedException(""); + } + private void SetState(ArrayList arrayList) + { + int ndim = 1; + var subArrayList = arrayList; + while (subArrayList.Count > 0 && subArrayList[0] != null && subArrayList[0].GetType() == typeof(ArrayList)) + { + subArrayList = (ArrayList)subArrayList[0]; + ndim += 1; + } + var type = subArrayList[0].GetType(); + if (type == typeof(int)) { - Console.WriteLine(args[i]); + if (ndim == 1) + { + int[] list = (int[])arrayList.ToArray(typeof(int)); + Shape shape = new Shape(new int[] { arrayList.Count }); + reconstructedArray = list; + reconstructedNDArray = new NDArray(list, shape); + //SetData(new[] { new Slice() }, new NDArray(list, shape)); + //set_shape(shape); + } + if (ndim == 2) + { + int secondDim = 0; + foreach (ArrayList subArray in arrayList) + { + secondDim = subArray.Count > secondDim ? subArray.Count : secondDim; + } + int[,] list = new int[arrayList.Count, secondDim]; + for (int i = 0; i < arrayList.Count; i++) + { + var subArray = (ArrayList?)arrayList[i]; + if (subArray == null) + throw new NullReferenceException(""); + for (int j = 0; j < subArray.Count; j++) + { + var element = subArray[j]; + if (element == null) + throw new NoNullAllowedException("the element of ArrayList cannot be null."); + list[i,j] = (int) element; + } + } + Shape shape = new Shape(new int[] { arrayList.Count, secondDim }); + reconstructedArray = list; + reconstructedNDArray = new NDArray(list, shape); + //SetData(new[] { new Slice() }, new NDArray(list, shape)); + //set_shape(shape); + } + if (ndim > 2) + throw new NotImplementedException("can't handle ArrayList with more than two dimensions."); } + else + throw new NotImplementedException(""); } } } diff --git a/src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs b/src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs index c8c2d45fa..4c64eba74 100644 --- a/src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs +++ b/src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs @@ -10,6 +10,7 @@ public class NDArrayConverter public unsafe static T Scalar(NDArray nd) where T : unmanaged => nd.dtype switch { + TF_DataType.TF_BOOL => Scalar(*(bool*)nd.data), TF_DataType.TF_UINT8 => Scalar(*(byte*)nd.data), TF_DataType.TF_FLOAT => Scalar(*(float*)nd.data), TF_DataType.TF_INT32 => Scalar(*(int*)nd.data), diff --git a/src/TensorFlowNET.Core/Numpy/Numpy.cs b/src/TensorFlowNET.Core/Numpy/Numpy.cs index 72d2e981c..fee2d63fc 100644 --- a/src/TensorFlowNET.Core/Numpy/Numpy.cs +++ b/src/TensorFlowNET.Core/Numpy/Numpy.cs @@ -43,7 +43,9 @@ public partial class np public static readonly TF_DataType @decimal = TF_DataType.TF_DOUBLE; public static readonly TF_DataType complex_ = TF_DataType.TF_COMPLEX; public static readonly TF_DataType complex64 = TF_DataType.TF_COMPLEX64; - public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128; + public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128; + public static readonly TF_DataType @string = TF_DataType.TF_STRING; + public static readonly TF_DataType @object = TF_DataType.TF_VARIANT; #endregion public static double nan => double.NaN; diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 016b352d9..6808035c6 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -70,7 +70,7 @@ namespace Tensorflow.Keras.Datasets public class Imdb { string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; - string file_name = "imdb.npz"; + string file_name = "simple.npz"; string dest_folder = "imdb"; /// /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). @@ -128,13 +128,15 @@ public DatasetPass load_data(string path = "imdb.npz", (NDArray, NDArray) LoadX(byte[] bytes) { - var y = np.Load_Npz(bytes); - return (y["x_train.npy"], y["x_test.npy"]); + var y = np.Load_Npz(bytes); + var x_train = y["x_train.npy"]; + var x_test = y["x_test.npy"]; + return (x_train, x_test); } (NDArray, NDArray) LoadY(byte[] bytes) { - var y = np.Load_Npz(bytes); + var y = np.Load_Npz(bytes); return (y["y_train.npy"], y["y_test.npy"]); } From ea978bbf214a75ead94c568755255a6f3c6fed58 Mon Sep 17 00:00:00 2001 From: lingbai-kong Date: Thu, 7 Sep 2023 21:33:29 +0800 Subject: [PATCH 3/5] optimize code structure of reconstruction ndarray from pickled npy file --- .../Implementation/NumPyImpl.Creation.cs | 12 ++---- .../NumPy/Implementation/NumPyImpl.load.cs | 10 +---- .../NumPy/Pickle/DTypePickleWarpper.cs | 20 ++++++++++ .../NumPy/{ => Pickle}/DtypeConstructor.cs | 17 +------- .../{ => Pickle}/MultiArrayConstructor.cs | 14 +++---- .../MultiArrayPickleWarpper.cs} | 39 ++++++++++++------- src/TensorFlowNET.Core/tensorflow.cs | 6 +++ src/TensorFlowNET.Keras/Datasets/Imdb.cs | 19 +++------ .../Dataset/DatasetTest.cs | 6 +-- 9 files changed, 75 insertions(+), 68 deletions(-) create mode 100644 src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs rename src/TensorFlowNET.Core/NumPy/{ => Pickle}/DtypeConstructor.cs (77%) rename src/TensorFlowNET.Core/NumPy/{ => Pickle}/MultiArrayConstructor.cs (91%) rename src/TensorFlowNET.Core/NumPy/{NDArray.Pickle.cs => Pickle/MultiArrayPickleWarpper.cs} (77%) diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs index 7b79f83c6..fa4ef0191 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs @@ -5,6 +5,7 @@ using System.Text; using Tensorflow.Util; using Razorvine.Pickle; +using Tensorflow.NumPy.Pickle; using static Tensorflow.Binding; namespace Tensorflow.NumPy @@ -94,20 +95,15 @@ Array ReadValueMatrix(BinaryReader reader, Array matrix, int bytes, Type type, i var buffer = reader.ReadBytes(bytes * total); System.Buffer.BlockCopy(buffer, 0, matrix, 0, buffer.Length); + return matrix; } - NDArray ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape) + Array ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape) { Stream stream = reader.BaseStream; - Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor()); - Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor()); - var unpickler = new Unpickler(); - - NDArray result = (NDArray) unpickler.load(stream); - Console.WriteLine(result.dims); - return result; + return (MultiArrayPickleWarpper)unpickler.load(stream); } public (NDArray, NDArray) meshgrid(T[] array, bool copy = true, bool sparse = false) diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs index bbe48e6a4..199e5ced3 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs @@ -30,17 +30,12 @@ public Array LoadMatrix(Stream stream) //return ReadStringMatrix(reader, matrix, bytes, type, shape); if (type == typeof(Object)) - { - NDArray res = ReadObjectMatrix(reader, matrix, shape); - // res = res.reconstructedNDArray; - return res.reconstructedArray; - } + return ReadObjectMatrix(reader, matrix, shape); else { return ReadValueMatrix(reader, matrix, bytes, type, shape); } } - } public T Load(Stream stream) @@ -59,7 +54,7 @@ bool ParseReader(BinaryReader reader, out int bytes, out Type t, out int[] shape shape = null; // The first 6 bytes are a magic string: exactly "x93NUMPY" - if (reader.ReadByte() != 0x93) return false; + if (reader.ReadChar() != 63) return false; if (reader.ReadChar() != 'N') return false; if (reader.ReadChar() != 'U') return false; if (reader.ReadChar() != 'M') return false; @@ -75,7 +70,6 @@ bool ParseReader(BinaryReader reader, out int bytes, out Type t, out int[] shape ushort len = reader.ReadUInt16(); string header = new String(reader.ReadChars(len)); - Console.WriteLine(header); string mark = "'descr': '"; int s = header.IndexOf(mark) + mark.Length; int e = header.IndexOf("'", s + 1); diff --git a/src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs b/src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs new file mode 100644 index 000000000..5dff6c16b --- /dev/null +++ b/src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs @@ -0,0 +1,20 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Tensorflow.NumPy.Pickle +{ + public class DTypePickleWarpper + { + TF_DataType dtype { get; set; } + public DTypePickleWarpper(TF_DataType dtype) + { + this.dtype = dtype; + } + public void __setstate__(object[] args) { } + public static implicit operator TF_DataType(DTypePickleWarpper dTypeWarpper) + { + return dTypeWarpper.dtype; + } + } +} diff --git a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs b/src/TensorFlowNET.Core/NumPy/Pickle/DtypeConstructor.cs similarity index 77% rename from src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs rename to src/TensorFlowNET.Core/NumPy/Pickle/DtypeConstructor.cs index 30ef82df4..160c7d4e9 100644 --- a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs +++ b/src/TensorFlowNET.Core/NumPy/Pickle/DtypeConstructor.cs @@ -4,7 +4,7 @@ using System.Text; using Razorvine.Pickle; -namespace Tensorflow.NumPy +namespace Tensorflow.NumPy.Pickle { /// /// @@ -46,20 +46,7 @@ public object construct(object[] args) dtype = np.@object; else throw new NotSupportedException(); - return new TF_DataType_Warpper(dtype); - } - } - public class TF_DataType_Warpper - { - TF_DataType dtype { get; set; } - public TF_DataType_Warpper(TF_DataType dtype) - { - this.dtype = dtype; - } - public void __setstate__(object[] args) { } - public static implicit operator TF_DataType(TF_DataType_Warpper dtypeWarpper) - { - return dtypeWarpper.dtype; + return new DTypePickleWarpper(dtype); } } } diff --git a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs b/src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayConstructor.cs similarity index 91% rename from src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs rename to src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayConstructor.cs index 43eda23e0..885f368c4 100644 --- a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs +++ b/src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayConstructor.cs @@ -5,7 +5,7 @@ using Razorvine.Pickle; using Razorvine.Pickle.Objects; -namespace Tensorflow.NumPy +namespace Tensorflow.NumPy.Pickle { /// /// Creates multiarrays of objects. Returns a primitive type multiarray such as int[][] if @@ -18,14 +18,14 @@ public class MultiArrayConstructor : IObjectConstructor { public object construct(object[] args) { - if (args.Length != 3) + if (args.Length != 3) throw new InvalidArgumentError($"Invalid number of arguments in MultiArrayConstructor._reconstruct. Expected three arguments. Given {args.Length} arguments."); - + var types = (ClassDictConstructor)args[0]; - if (types.module != "numpy" || types.name != "ndarray") + if (types.module != "numpy" || types.name != "ndarray") throw new RuntimeError("_reconstruct: First argument must be a sub-type of ndarray"); - - var arg1 = (Object[])args[1]; + + var arg1 = (object[])args[1]; var dims = new int[arg1.Length]; for (var i = 0; i < arg1.Length; i++) { @@ -47,7 +47,7 @@ public object construct(object[] args) case "b": dtype = np.@bool; break; default: throw new NotImplementedException($"Unsupported data type: {args[2]}"); } - return new NDArray(shape, dtype); + return new MultiArrayPickleWarpper(shape, dtype); } } } diff --git a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs b/src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayPickleWarpper.cs similarity index 77% rename from src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs rename to src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayPickleWarpper.cs index 62720826a..af8d1ecc2 100644 --- a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs +++ b/src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayPickleWarpper.cs @@ -5,12 +5,19 @@ using System.Collections.Generic; using System.Text; -namespace Tensorflow.NumPy +namespace Tensorflow.NumPy.Pickle { - public partial class NDArray + public class MultiArrayPickleWarpper { + public Shape reconstructedShape { get; set; } + public TF_DataType reconstructedDType { get; set; } public NDArray reconstructedNDArray { get; set; } - public Array reconstructedArray { get; set; } + public Array reconstructedMultiArray { get; set; } + public MultiArrayPickleWarpper(Shape shape, TF_DataType dtype) + { + reconstructedShape = shape; + reconstructedDType = dtype; + } public void __setstate__(object[] args) { if (args.Length != 5) @@ -18,7 +25,7 @@ public void __setstate__(object[] args) var version = (int)args[0]; // version - var arg1 = (Object[])args[1]; + var arg1 = (object[])args[1]; var dims = new int[arg1.Length]; for (var i = 0; i < arg1.Length; i++) { @@ -26,7 +33,7 @@ public void __setstate__(object[] args) } var _ShapeLike = new Shape(dims); // shape - TF_DataType _DType_co = (TF_DataType_Warpper)args[2]; // DType + TF_DataType _DType_co = (DTypePickleWarpper)args[2]; // DType var F_continuous = (bool)args[3]; // F-continuous if (F_continuous) @@ -45,12 +52,12 @@ public void __setstate__(object[] args) if (data.GetType() == typeof(ArrayList)) { - SetState((ArrayList)data); + Reconstruct((ArrayList)data); } else throw new NotImplementedException(""); } - private void SetState(ArrayList arrayList) + private void Reconstruct(ArrayList arrayList) { int ndim = 1; var subArrayList = arrayList; @@ -66,10 +73,8 @@ private void SetState(ArrayList arrayList) { int[] list = (int[])arrayList.ToArray(typeof(int)); Shape shape = new Shape(new int[] { arrayList.Count }); - reconstructedArray = list; + reconstructedMultiArray = list; reconstructedNDArray = new NDArray(list, shape); - //SetData(new[] { new Slice() }, new NDArray(list, shape)); - //set_shape(shape); } if (ndim == 2) { @@ -89,14 +94,12 @@ private void SetState(ArrayList arrayList) var element = subArray[j]; if (element == null) throw new NoNullAllowedException("the element of ArrayList cannot be null."); - list[i,j] = (int) element; + list[i, j] = (int)element; } } Shape shape = new Shape(new int[] { arrayList.Count, secondDim }); - reconstructedArray = list; + reconstructedMultiArray = list; reconstructedNDArray = new NDArray(list, shape); - //SetData(new[] { new Slice() }, new NDArray(list, shape)); - //set_shape(shape); } if (ndim > 2) throw new NotImplementedException("can't handle ArrayList with more than two dimensions."); @@ -104,5 +107,13 @@ private void SetState(ArrayList arrayList) else throw new NotImplementedException(""); } + public static implicit operator Array(MultiArrayPickleWarpper arrayWarpper) + { + return arrayWarpper.reconstructedMultiArray; + } + public static implicit operator NDArray(MultiArrayPickleWarpper arrayWarpper) + { + return arrayWarpper.reconstructedNDArray; + } } } diff --git a/src/TensorFlowNET.Core/tensorflow.cs b/src/TensorFlowNET.Core/tensorflow.cs index dc4e48da8..e368b37cd 100644 --- a/src/TensorFlowNET.Core/tensorflow.cs +++ b/src/TensorFlowNET.Core/tensorflow.cs @@ -14,6 +14,7 @@ You may obtain a copy of the License at limitations under the License. ******************************************************************************/ +using Razorvine.Pickle; using Serilog; using Serilog.Core; using System.Reflection; @@ -22,6 +23,7 @@ limitations under the License. using Tensorflow.Eager; using Tensorflow.Gradients; using Tensorflow.Keras; +using Tensorflow.NumPy.Pickle; namespace Tensorflow { @@ -98,6 +100,10 @@ public tensorflow() "please visit https://github.com/SciSharp/TensorFlow.NET. If it still not work after installing the backend, please submit an " + "issue to https://github.com/SciSharp/TensorFlow.NET/issues"); } + + // register numpy reconstructor for pickle + Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor()); + Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor()); } public string VERSION => c_api.StringPiece(c_api.TF_Version()); diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 6808035c6..a992ae84a 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -5,13 +5,6 @@ using Tensorflow.Keras.Utils; using Tensorflow.NumPy; using System.Linq; -using Google.Protobuf.Collections; -using Microsoft.VisualBasic; -using OneOf.Types; -using static HDF.PInvoke.H5; -using System.Data; -using System.Reflection.Emit; -using System.Xml.Linq; namespace Tensorflow.Keras.Datasets { @@ -70,8 +63,9 @@ namespace Tensorflow.Keras.Datasets public class Imdb { string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; - string file_name = "simple.npz"; + string file_name = "imdb.npz"; string dest_folder = "imdb"; + /// /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). /// @@ -95,8 +89,9 @@ public DatasetPass load_data(string path = "imdb.npz", { var dst = Download(); var fileBytes = File.ReadAllBytes(Path.Combine(dst, file_name)); - var (x_train, x_test) = LoadX(fileBytes); var (y_train, y_test) = LoadY(fileBytes); + var (x_train, x_test) = LoadX(fileBytes); + /*var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt")); var x_train_string = new string[lines.Length]; var y_train = np.zeros(new int[] { lines.Length }, np.int64); @@ -129,14 +124,12 @@ public DatasetPass load_data(string path = "imdb.npz", (NDArray, NDArray) LoadX(byte[] bytes) { var y = np.Load_Npz(bytes); - var x_train = y["x_train.npy"]; - var x_test = y["x_test.npy"]; - return (x_train, x_test); + return (y["x_train.npy"], y["x_test.npy"]); } (NDArray, NDArray) LoadY(byte[] bytes) { - var y = np.Load_Npz(bytes); + var y = np.Load_Npz(bytes); return (y["y_train.npy"], y["y_test.npy"]); } diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs index 778290bb8..db6252efc 100644 --- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs +++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs @@ -1,6 +1,5 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using System; -using System.Collections.Generic; using System.Linq; using static Tensorflow.Binding; using static Tensorflow.KerasApi; @@ -197,6 +196,7 @@ public void Shuffle() Assert.IsFalse(allEqual); } + [Ignore] [TestMethod] public void GetData() { @@ -209,8 +209,8 @@ public void GetData() var y_val = dataset.Test.Item2; print(len(x_train) + "Training sequences"); print(len(x_val) + "Validation sequences"); - x_train = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_train, maxlen: maxlen); - x_val = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_val, maxlen: maxlen); + //x_train = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_train, maxlen: maxlen); + //x_val = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_val, maxlen: maxlen); } } } From 28c77f53d64dbe78284bf46b00c8c945d76fb31c Mon Sep 17 00:00:00 2001 From: lingbai-kong Date: Fri, 8 Sep 2023 17:38:54 +0800 Subject: [PATCH 4/5] implement Imdb dataset loader --- .../NumPy/Implementation/RandomizedImpl.cs | 4 +- src/TensorFlowNET.Keras/Datasets/Imdb.cs | 186 ++++++++++++------ src/TensorFlowNET.Keras/Utils/data_utils.cs | 47 +++++ .../Dataset/DatasetTest.cs | 28 ++- 4 files changed, 198 insertions(+), 67 deletions(-) diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs b/src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs index 064c7362f..a707e8aae 100644 --- a/src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs +++ b/src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs @@ -14,9 +14,9 @@ public class RandomizedImpl public NDArray permutation(NDArray x) => new NDArray(random_ops.random_shuffle(x)); [AutoNumPy] - public void shuffle(NDArray x) + public void shuffle(NDArray x, int? seed = null) { - var y = random_ops.random_shuffle(x); + var y = random_ops.random_shuffle(x, seed); Marshal.Copy(y.BufferToArray(), 0, x.TensorDataPointer, (int)x.bytesize); } diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 68364ea67..0266b48bd 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -3,8 +3,6 @@ using System.IO; using System.Text; using Tensorflow.Keras.Utils; -using Tensorflow.NumPy; -using System.Linq; namespace Tensorflow.Keras.Datasets { @@ -41,14 +39,14 @@ namespace Tensorflow.Keras.Datasets /// `skip_top` limits will be replaced with this character. /// index_from: int. Index actual words with this index and higher. /// Returns: - /// Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. + /// Tuple of Numpy arrays: `(x_train, labels_train), (x_test, labels_test)`. /// /// ** x_train, x_test**: lists of sequences, which are lists of indexes /// (integers). If the num_words argument was specific, the maximum /// possible index value is `num_words - 1`. If the `maxlen` argument was /// specified, the largest possible sequence length is `maxlen`. /// - /// ** y_train, y_test**: lists of integer labels(1 or 0). + /// ** labels_train, labels_test**: lists of integer labels(1 or 0). /// /// Raises: /// ValueError: in case `maxlen` is so low @@ -63,7 +61,6 @@ namespace Tensorflow.Keras.Datasets public class Imdb { string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; - string file_name = "imdb.npz"; string dest_folder = "imdb"; /// @@ -78,43 +75,139 @@ public class Imdb /// /// /// - public DatasetPass load_data(string? path = "imdb.npz", - int num_words = -1, + public DatasetPass load_data( + string path = "imdb.npz", + int? num_words = null, int skip_top = 0, - int maxlen = -1, + int? maxlen = null, int seed = 113, - int start_char = 1, - int oov_char= 2, + int? start_char = 1, + int? oov_char = 2, int index_from = 3) { - if (maxlen == -1) throw new InvalidArgumentError("maxlen must be assigned."); - - var dst = path ?? Download(); - var fileBytes = File.ReadAllBytes(Path.Combine(dst, file_name)); - var (y_train, y_test) = LoadY(fileBytes); + path = data_utils.get_file( + path, + origin: Path.Combine(origin_folder, "imdb.npz"), + file_hash: "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f" + ); + path = Path.Combine(path, "imdb.npz"); + var fileBytes = File.ReadAllBytes(path); var (x_train, x_test) = LoadX(fileBytes); - - /*var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt")); - var x_train_string = new string[lines.Length]; - var y_train = np.zeros(new int[] { lines.Length }, np.int64); - for (int i = 0; i < lines.Length; i++) + var (labels_train, labels_test) = LoadY(fileBytes); + x_test.astype(np.int32); + labels_test.astype(np.int32); + + var indices = np.arange(len(x_train)); + np.random.shuffle(indices, seed); + x_train = x_train[indices]; + labels_train = labels_train[indices]; + + indices = np.arange(len(x_test)); + np.random.shuffle(indices, seed); + x_test = x_test[indices]; + labels_test = labels_test[indices]; + + if (start_char != null) + { + int[,] new_x_train = new int[x_train.shape[0], x_train.shape[1] + 1]; + for (var i = 0; i < x_train.shape[0]; i++) + { + new_x_train[i, 0] = (int)start_char; + for (var j = 0; j < x_train.shape[1]; j++) + { + new_x_train[i, j + 1] = x_train[i][j]; + } + } + int[,] new_x_test = new int[x_test.shape[0], x_test.shape[1] + 1]; + for (var i = 0; i < x_test.shape[0]; i++) + { + new_x_test[i, 0] = (int)start_char; + for (var j = 0; j < x_test.shape[1]; j++) + { + new_x_test[i, j + 1] = x_test[i][j]; + } + } + x_train = new NDArray(new_x_train); + x_test = new NDArray(new_x_test); + } + else if (index_from != 0) + { + for (var i = 0; i < x_train.shape[0]; i++) + { + for (var j = 0; j < x_train.shape[1]; j++) + { + if (x_train[i, j] != 0) + x_train[i, j] += index_from; + } + } + for (var i = 0; i < x_test.shape[0]; i++) + { + for (var j = 0; j < x_test.shape[1]; j++) + { + if (x_test[i, j] != 0) + x_test[i, j] += index_from; + } + } + } + + if (maxlen != null) { - y_train[i] = long.Parse(lines[i].Substring(0, 1)); - x_train_string[i] = lines[i].Substring(2); + (x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train, labels_train); + (x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test, labels_test); + if (x_train.size == 0 || x_test.size == 0) + throw new ValueError("After filtering for sequences shorter than maxlen=" + + $"{maxlen}, no sequence was kept. Increase maxlen."); } - var x_train = keras.preprocessing.sequence.pad_sequences(PraseData(x_train_string), maxlen: maxlen); + var xs = np.concatenate(new[] { x_train, x_test }); + var labels = np.concatenate(new[] { labels_train, labels_test }); - lines = File.ReadAllLines(Path.Combine(dst, "imdb_test.txt")); - var x_test_string = new string[lines.Length]; - var y_test = np.zeros(new int[] { lines.Length }, np.int64); - for (int i = 0; i < lines.Length; i++) + if(num_words == null) { - y_test[i] = long.Parse(lines[i].Substring(0, 1)); - x_test_string[i] = lines[i].Substring(2); + num_words = 0; + for (var i = 0; i < xs.shape[0]; i++) + for (var j = 0; j < xs.shape[1]; j++) + num_words = max((int)num_words, (int)xs[i][j]); } - var x_test = np.array(x_test_string);*/ + // by convention, use 2 as OOV word + // reserve 'index_from' (=3 by default) characters: + // 0 (padding), 1 (start), 2 (OOV) + if (oov_char != null) + { + int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; + for(var i = 0; i < xs.shape[0]; i++) + { + for(var j = 0; j < xs.shape[1]; j++) + { + if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) + new_xs[i, j] = (int)xs[i][j]; + else + new_xs[i, j] = (int)oov_char; + } + } + xs = new NDArray(new_xs); + } + else + { + int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; + for (var i = 0; i < xs.shape[0]; i++) + { + int k = 0; + for (var j = 0; j < xs.shape[1]; j++) + { + if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) + new_xs[i, k++] = (int)xs[i][j]; + } + } + xs = new NDArray(new_xs); + } + + var idx = len(x_train); + x_train = xs[$"0:{idx}"]; + x_test = xs[$"{idx}:"]; + var y_train = labels[$"0:{idx}"]; + var y_test = labels[$"{idx}:"]; return new DatasetPass { @@ -125,8 +218,8 @@ public DatasetPass load_data(string? path = "imdb.npz", (NDArray, NDArray) LoadX(byte[] bytes) { - var y = np.Load_Npz(bytes); - return (y["x_train.npy"], y["x_test.npy"]); + var x = np.Load_Npz(bytes); + return (x["x_train.npy"], x["x_test.npy"]); } (NDArray, NDArray) LoadY(byte[] bytes) @@ -134,34 +227,5 @@ public DatasetPass load_data(string? path = "imdb.npz", var y = np.Load_Npz(bytes); return (y["y_train.npy"], y["y_test.npy"]); } - - string Download() - { - var dst = Path.Combine(Path.GetTempPath(), dest_folder); - Directory.CreateDirectory(dst); - - Web.Download(origin_folder + file_name, dst, file_name); - - return dst; - // return Path.Combine(dst, file_name); - } - - protected IEnumerable PraseData(string[] x) - { - var data_list = new List(); - for (int i = 0; i < len(x); i++) - { - var list_string = x[i]; - var cleaned_list_string = list_string.Replace("[", "").Replace("]", "").Replace(" ", ""); - string[] number_strings = cleaned_list_string.Split(','); - int[] numbers = new int[number_strings.Length]; - for (int j = 0; j < number_strings.Length; j++) - { - numbers[j] = int.Parse(number_strings[j]); - } - data_list.Add(numbers); - } - return data_list; - } } } diff --git a/src/TensorFlowNET.Keras/Utils/data_utils.cs b/src/TensorFlowNET.Keras/Utils/data_utils.cs index 5b84c601f..16b121b07 100644 --- a/src/TensorFlowNET.Keras/Utils/data_utils.cs +++ b/src/TensorFlowNET.Keras/Utils/data_utils.cs @@ -39,5 +39,52 @@ public static string get_file(string fname, string origin, return datadir; } + + public static (NDArray, NDArray) _remove_long_seq(int maxlen, NDArray seq, NDArray label) + { + /*Removes sequences that exceed the maximum length. + + Args: + maxlen: Int, maximum length of the output sequences. + seq: List of lists, where each sublist is a sequence. + label: List where each element is an integer. + + Returns: + new_seq, new_label: shortened lists for `seq` and `label`. + + */ + List new_seq = new List(); + List new_label = new List(); + + for (var i = 0; i < seq.shape[0]; i++) + { + if (maxlen < seq.shape[1] && seq[i][maxlen] != 0) + continue; + int[] sentence = new int[maxlen]; + for (var j = 0; j < maxlen && j < seq.shape[1]; j++) + { + sentence[j] = seq[i, j]; + } + new_seq.Add(sentence); + new_label.Add(label[i]); + } + + int[,] new_seq_array = new int[new_seq.Count, maxlen]; + int[] new_label_array = new int[new_label.Count]; + + for (var i = 0; i < new_seq.Count; i++) + { + for (var j = 0; j < maxlen; j++) + { + new_seq_array[i, j] = new_seq[i][j]; + } + } + + for (var i = 0; i < new_label.Count; i++) + { + new_label_array[i] = new_label[i]; + } + return (new_seq_array, new_label_array); + } } } diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs index db6252efc..251eeff90 100644 --- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs +++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs @@ -1,6 +1,8 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using System; +using System.Collections.Generic; using System.Linq; +using Tensorflow.NumPy; using static Tensorflow.Binding; using static Tensorflow.KerasApi; @@ -207,10 +209,28 @@ public void GetData() var y_train = dataset.Train.Item2; var x_val = dataset.Test.Item1; var y_val = dataset.Test.Item2; - print(len(x_train) + "Training sequences"); - print(len(x_val) + "Validation sequences"); - //x_train = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_train, maxlen: maxlen); - //x_val = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_val, maxlen: maxlen); + + x_train = keras.preprocessing.sequence.pad_sequences(RemoveZeros(x_train), maxlen: maxlen); + x_val = keras.preprocessing.sequence.pad_sequences(RemoveZeros(x_val), maxlen: maxlen); + print(len(x_train) + " Training sequences"); + print(len(x_val) + " Validation sequences"); + } + IEnumerable RemoveZeros(NDArray data) + { + List new_data = new List(); + for (var i = 0; i < data.shape[0]; i++) + { + List new_array = new List(); + for (var j = 0; j < data.shape[1]; j++) + { + if (data[i][j] == 0) + break; + else + new_array.Add((int)data[i][j]); + } + new_data.Add(new_array.ToArray()); + } + return new_data; } } } From f57a6fe6ed006f79511f4cc9550eeda312b11e98 Mon Sep 17 00:00:00 2001 From: lingbai-kong Date: Sat, 9 Sep 2023 18:31:46 +0800 Subject: [PATCH 5/5] optimize the time complexity of Imdb dataset loader --- src/TensorFlowNET.Keras/Datasets/Imdb.cs | 101 ++++++++++-------- src/TensorFlowNET.Keras/Utils/data_utils.cs | 16 +-- .../Dataset/DatasetTest.cs | 11 +- 3 files changed, 71 insertions(+), 57 deletions(-) diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 0266b48bd..49fc79251 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -94,8 +94,6 @@ public DatasetPass load_data( var fileBytes = File.ReadAllBytes(path); var (x_train, x_test) = LoadX(fileBytes); var (labels_train, labels_test) = LoadY(fileBytes); - x_test.astype(np.int32); - labels_test.astype(np.int32); var indices = np.arange(len(x_train)); np.random.shuffle(indices, seed); @@ -107,67 +105,80 @@ public DatasetPass load_data( x_test = x_test[indices]; labels_test = labels_test[indices]; + var x_train_array = (int[,])x_train.ToMultiDimArray(); + var x_test_array = (int[,])x_test.ToMultiDimArray(); + var labels_train_array = (long[])labels_train.ToArray(); + var labels_test_array = (long[])labels_test.ToArray(); + if (start_char != null) { - int[,] new_x_train = new int[x_train.shape[0], x_train.shape[1] + 1]; - for (var i = 0; i < x_train.shape[0]; i++) + int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1]; + for (var i = 0; i < x_train_array.GetLength(0); i++) { - new_x_train[i, 0] = (int)start_char; - for (var j = 0; j < x_train.shape[1]; j++) + new_x_train_array[i, 0] = (int)start_char; + for (var j = 0; j < x_train_array.GetLength(1); j++) { - new_x_train[i, j + 1] = x_train[i][j]; + if (x_train_array[i, j] == 0) + break; + new_x_train_array[i, j + 1] = x_train_array[i, j]; } } - int[,] new_x_test = new int[x_test.shape[0], x_test.shape[1] + 1]; - for (var i = 0; i < x_test.shape[0]; i++) + int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1]; + for (var i = 0; i < x_test_array.GetLength(0); i++) { - new_x_test[i, 0] = (int)start_char; - for (var j = 0; j < x_test.shape[1]; j++) + new_x_test_array[i, 0] = (int)start_char; + for (var j = 0; j < x_test_array.GetLength(1); j++) { - new_x_test[i, j + 1] = x_test[i][j]; + if (x_test_array[i, j] == 0) + break; + new_x_test_array[i, j + 1] = x_test_array[i, j]; } } - x_train = new NDArray(new_x_train); - x_test = new NDArray(new_x_test); + x_train_array = new_x_train_array; + x_test_array = new_x_test_array; } else if (index_from != 0) { - for (var i = 0; i < x_train.shape[0]; i++) + for (var i = 0; i < x_train_array.GetLength(0); i++) { - for (var j = 0; j < x_train.shape[1]; j++) + for (var j = 0; j < x_train_array.GetLength(1); j++) { - if (x_train[i, j] != 0) - x_train[i, j] += index_from; + if (x_train_array[i, j] == 0) + break; + x_train_array[i, j] += index_from; } } - for (var i = 0; i < x_test.shape[0]; i++) + for (var i = 0; i < x_test_array.GetLength(0); i++) { - for (var j = 0; j < x_test.shape[1]; j++) + for (var j = 0; j < x_test_array.GetLength(1); j++) { - if (x_test[i, j] != 0) - x_test[i, j] += index_from; + if (x_test_array[i, j] == 0) + break; + x_test[i, j] += index_from; } } } - if (maxlen != null) + if (maxlen == null) { - (x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train, labels_train); - (x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test, labels_test); - if (x_train.size == 0 || x_test.size == 0) - throw new ValueError("After filtering for sequences shorter than maxlen=" + - $"{maxlen}, no sequence was kept. Increase maxlen."); + maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1)); } + (x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array); + (x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array); + if (x_train.size == 0 || x_test.size == 0) + throw new ValueError("After filtering for sequences shorter than maxlen=" + + $"{maxlen}, no sequence was kept. Increase maxlen."); var xs = np.concatenate(new[] { x_train, x_test }); var labels = np.concatenate(new[] { labels_train, labels_test }); + var xs_array = (int[,])xs.ToMultiDimArray(); - if(num_words == null) + if (num_words == null) { num_words = 0; - for (var i = 0; i < xs.shape[0]; i++) - for (var j = 0; j < xs.shape[1]; j++) - num_words = max((int)num_words, (int)xs[i][j]); + for (var i = 0; i < xs_array.GetLength(0); i++) + for (var j = 0; j < xs_array.GetLength(1); j++) + num_words = max((int)num_words, (int)xs_array[i, j]); } // by convention, use 2 as OOV word @@ -175,32 +186,32 @@ public DatasetPass load_data( // 0 (padding), 1 (start), 2 (OOV) if (oov_char != null) { - int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; - for(var i = 0; i < xs.shape[0]; i++) + int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; + for (var i = 0; i < xs_array.GetLength(0); i++) { - for(var j = 0; j < xs.shape[1]; j++) + for (var j = 0; j < xs_array.GetLength(1); j++) { - if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) - new_xs[i, j] = (int)xs[i][j]; + if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) + new_xs_array[i, j] = xs_array[i, j]; else - new_xs[i, j] = (int)oov_char; + new_xs_array[i, j] = (int)oov_char; } } - xs = new NDArray(new_xs); + xs = new NDArray(new_xs_array); } else { - int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; - for (var i = 0; i < xs.shape[0]; i++) + int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; + for (var i = 0; i < xs_array.GetLength(0); i++) { int k = 0; - for (var j = 0; j < xs.shape[1]; j++) + for (var j = 0; j < xs_array.GetLength(1); j++) { - if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) - new_xs[i, k++] = (int)xs[i][j]; + if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) + new_xs_array[i, k++] = xs_array[i, j]; } } - xs = new NDArray(new_xs); + xs = new NDArray(new_xs_array); } var idx = len(x_train); diff --git a/src/TensorFlowNET.Keras/Utils/data_utils.cs b/src/TensorFlowNET.Keras/Utils/data_utils.cs index 16b121b07..57ae76695 100644 --- a/src/TensorFlowNET.Keras/Utils/data_utils.cs +++ b/src/TensorFlowNET.Keras/Utils/data_utils.cs @@ -54,23 +54,25 @@ public static (NDArray, NDArray) _remove_long_seq(int maxlen, NDArray seq, NDArr */ List new_seq = new List(); - List new_label = new List(); + List new_label = new List(); - for (var i = 0; i < seq.shape[0]; i++) + var seq_array = (int[,])seq.ToMultiDimArray(); + var label_array = (long[])label.ToArray(); + for (var i = 0; i < seq_array.GetLength(0); i++) { - if (maxlen < seq.shape[1] && seq[i][maxlen] != 0) + if (maxlen < seq_array.GetLength(1) && seq_array[i,maxlen] != 0) continue; int[] sentence = new int[maxlen]; - for (var j = 0; j < maxlen && j < seq.shape[1]; j++) + for (var j = 0; j < maxlen && j < seq_array.GetLength(1); j++) { - sentence[j] = seq[i, j]; + sentence[j] = seq_array[i, j]; } new_seq.Add(sentence); - new_label.Add(label[i]); + new_label.Add(label_array[i]); } int[,] new_seq_array = new int[new_seq.Count, maxlen]; - int[] new_label_array = new int[new_label.Count]; + long[] new_label_array = new long[new_label.Count]; for (var i = 0; i < new_seq.Count; i++) { diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs index 251eeff90..183544ab6 100644 --- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs +++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs @@ -204,7 +204,7 @@ public void GetData() { var vocab_size = 20000; // Only consider the top 20k words var maxlen = 200; // Only consider the first 200 words of each movie review - var dataset = keras.datasets.imdb.load_data(num_words: vocab_size); + var dataset = keras.datasets.imdb.load_data(num_words: vocab_size, maxlen: maxlen); var x_train = dataset.Train.Item1; var y_train = dataset.Train.Item2; var x_val = dataset.Test.Item1; @@ -217,16 +217,17 @@ public void GetData() } IEnumerable RemoveZeros(NDArray data) { + var data_array = (int[,])data.ToMultiDimArray(); List new_data = new List(); - for (var i = 0; i < data.shape[0]; i++) + for (var i = 0; i < data_array.GetLength(0); i++) { List new_array = new List(); - for (var j = 0; j < data.shape[1]; j++) + for (var j = 0; j < data_array.GetLength(1); j++) { - if (data[i][j] == 0) + if (data_array[i, j] == 0) break; else - new_array.Add((int)data[i][j]); + new_array.Add(data_array[i, j]); } new_data.Add(new_array.ToArray()); }