mirror of https://github.com/VLSIDA/OpenRAM.git
Data scaling now depends on a single file rather than on a directory.
parent fc55cd194d
commit 393a9ca0d8
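The practical effect is that every scaling helper now takes the path of a single CSV file instead of a directory of sample files. A hedged before/after sketch of a call site (variable names are taken from the diff below; the sketch itself is not part of the commit):

    # Before this commit: statistics were gathered from every dataset in a directory.
    #   maxs, mins, avgs = get_max_min_from_datasets(sample_dir)
    # After this commit: statistics come from the one CSV the model also trains on.
    #   maxs, mins, avgs = get_max_min_from_file(file_path)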
@@ -110,6 +110,24 @@ def get_max_min_from_datasets(dir):
     avgs = [s/total_count for s in sums]
     return maxs,mins,avgs
 
+def get_max_min_from_file(path):
+    if not os.path.isfile(path):
+        debug.warning("Input file not found: {}".format(path))
+        return [], [], []
+
+
+    data = get_data(path)
+    # Get max, min, sum, and count from every file
+    data_max, data_min, data_sum, count = [],[],[], 0
+    for feature_list in data:
+        data_max.append(max(feature_list))
+        data_min.append(min(feature_list))
+        data_sum.append(sum(feature_list))
+        count = len(feature_list)
+
+    avgs = [s/count for s in data_sum]
+    return data_max, data_min, avgs
+
 def get_data_and_scale(file_name, sample_dir):
     maxs,mins,avgs = get_max_min_from_datasets(sample_dir)
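For orientation, the new helper appears to expect get_data() to return one list per feature; a minimal, self-contained sketch of the same per-feature max/min/mean computation (toy data, not from the repository):

    # Toy re-implementation of the statistics computed by get_max_min_from_file();
    # the data layout (one inner list per feature) is an assumption based on this diff.
    data = [[0.1, 0.4, 0.3],      # feature 0 across all samples
            [10.0, 20.0, 15.0]]   # feature 1 across all samples
    data_max = [max(f) for f in data]       # [0.4, 20.0]
    data_min = [min(f) for f in data]       # [0.1, 10.0]
    count = len(data[0])                    # samples per feature
    avgs = [sum(f) / count for f in data]   # [~0.267, 15.0]
    print(data_max, data_min, avgs)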
@@ -179,11 +197,11 @@ def sample_from_file(num_samples, file_name, sample_dir=None):
     return np.asarray(sampled_data), np.asarray(unused_new_scaling)
 
-def get_scaled_data(file_name, sample_dir=None):
+def get_scaled_data(file_name):
     """Get data from CSV file and scale it based on max/min of dataset"""
 
-    if sample_dir:
-        maxs,mins,avgs = get_max_min_from_datasets(sample_dir)
+    if file_name:
+        maxs,mins,avgs = get_max_min_from_file(file_name)
     else:
         maxs,mins,avgs = [], [], []
@@ -210,12 +228,12 @@ def scale_data_and_transform(data):
         scaled_data[i].append((feature_list[i]-min_val)/(max_val-min_val))
     return scaled_data
 
-def scale_input_datapoint(point, data_dir):
+def scale_input_datapoint(point, file_path):
     """
     Input data has no output and needs to be scaled like the model inputs during
     training.
     """
-    maxs, mins, avgs = get_max_min_from_datasets(data_dir)
+    maxs, mins, avgs = get_max_min_from_file(file_path)
     debug.info(1, "maxs={}".format(maxs))
     debug.info(1, "mins={}".format(mins))
     debug.info(1, "point={}".format(point))
@@ -225,9 +243,9 @@ def scale_input_datapoint(point, data_dir):
         scaled_point.append((feature-mn)/(mx-mn))
     return scaled_point
 
-def unscale_data(data, ref_dir, pos=None):
-    if ref_dir:
-        maxs,mins,avgs = get_max_min_from_datasets(ref_dir)
+def unscale_data(data, file_path, pos=None):
+    if file_path:
+        maxs,mins,avgs = get_max_min_from_file(file_path)
     else:
         print("Must provide reference data to unscale")
         return None
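The forward scaling shown above is (x - min)/(max - min) per feature; unscale_data presumably applies the inverse using the same per-feature statistics. A one-line sketch of that inverse (not repository code; variable names follow the diff):

    # Inverse of the min-max scaling used in scale_input_datapoint (sketch only).
    unscaled_point = [s*(mx - mn) + mn for s, mx, mn in zip(scaled_point, maxs, mins)]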
@@ -25,12 +25,12 @@ class linear_regression():
     def get_prediction(self, model_inputs):
 
         file_path = data_dir +'/'+data_filename
-        scaled_inputs = np.asarray([scale_input_datapoint(model_inputs, data_dir)])
+        scaled_inputs = np.asarray([scale_input_datapoint(model_inputs, file_path)])
 
-        features, labels = get_scaled_data(file_path, data_dir)
+        features, labels = get_scaled_data(file_path)
         self.train_model(features, labels)
         scaled_pred = self.model_prediction(scaled_inputs)
-        pred = unscale_data(scaled_pred.tolist(), data_dir)
+        pred = unscale_data(scaled_pred.tolist(), file_path)
         debug.info(1,"Unscaled Prediction = {}".format(pred))
         return pred
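Taken together, get_prediction now threads the same file_path through every scaling step; a condensed, hedged view of that flow, restricted to the calls shown in this diff:

    # Condensed flow of linear_regression.get_prediction after this commit (sketch only).
    #   scaled_inputs = scale_input_datapoint(model_inputs, file_path)   # scale the query point
    #   features, labels = get_scaled_data(file_path)                    # scale the training data
    #   self.train_model(features, labels)                               # fit on scaled data
    #   pred = unscale_data(self.model_prediction(scaled_inputs), file_path)  # map back to real units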