Data scaling is only dependent on a single file rather than a directory now.

This commit is contained in:
Hunter Nichols 2020-12-09 15:03:04 -08:00
parent fc55cd194d
commit 393a9ca0d8
2 changed files with 29 additions and 11 deletions

View File

@ -110,6 +110,24 @@ def get_max_min_from_datasets(dir):
avgs = [s/total_count for s in sums] avgs = [s/total_count for s in sums]
return maxs,mins,avgs return maxs,mins,avgs
def get_max_min_from_file(path):
    """
    Compute the per-feature max, min, and average of the data in one file.

    path is handed to get_data(), and each item of the returned iterable is
    treated as the list of values for one feature.

    Returns three parallel lists (maxs, mins, avgs) with one entry per
    feature. Returns three empty lists, after logging a warning, if the file
    does not exist or if any feature has no values.
    """
    if not os.path.isfile(path):
        debug.warning("Input file not found: {}".format(path))
        return [], [], []

    data = get_data(path)

    data_max, data_min, avgs = [], [], []
    for feature_list in data:
        # max()/min() raise ValueError on an empty sequence and the average
        # would divide by zero. Report it the same way as a missing file.
        # len() is used instead of truthiness so array-like columns work too.
        if len(feature_list) == 0:
            debug.warning("No data found in file: {}".format(path))
            return [], [], []
        data_max.append(max(feature_list))
        data_min.append(min(feature_list))
        # Divide by this feature's own sample count; reusing the length of
        # the last feature for all of them skews the averages whenever the
        # feature lists differ in length.
        avgs.append(sum(feature_list) / len(feature_list))
    return data_max, data_min, avgs
def get_data_and_scale(file_name, sample_dir): def get_data_and_scale(file_name, sample_dir):
maxs,mins,avgs = get_max_min_from_datasets(sample_dir) maxs,mins,avgs = get_max_min_from_datasets(sample_dir)
@ -179,11 +197,11 @@ def sample_from_file(num_samples, file_name, sample_dir=None):
return np.asarray(sampled_data), np.asarray(unused_new_scaling) return np.asarray(sampled_data), np.asarray(unused_new_scaling)
def get_scaled_data(file_name, sample_dir=None): def get_scaled_data(file_name):
"""Get data from CSV file and scale it based on max/min of dataset""" """Get data from CSV file and scale it based on max/min of dataset"""
if sample_dir: if file_name:
maxs,mins,avgs = get_max_min_from_datasets(sample_dir) maxs,mins,avgs = get_max_min_from_file(file_name)
else: else:
maxs,mins,avgs = [], [], [] maxs,mins,avgs = [], [], []
@ -210,12 +228,12 @@ def scale_data_and_transform(data):
scaled_data[i].append((feature_list[i]-min_val)/(max_val-min_val)) scaled_data[i].append((feature_list[i]-min_val)/(max_val-min_val))
return scaled_data return scaled_data
def scale_input_datapoint(point, data_dir): def scale_input_datapoint(point, file_path):
""" """
Input data has no output and needs to be scaled like the model inputs during Input data has no output and needs to be scaled like the model inputs during
training. training.
""" """
maxs, mins, avgs = get_max_min_from_datasets(data_dir) maxs, mins, avgs = get_max_min_from_file(file_path)
debug.info(1, "maxs={}".format(maxs)) debug.info(1, "maxs={}".format(maxs))
debug.info(1, "mins={}".format(mins)) debug.info(1, "mins={}".format(mins))
debug.info(1, "point={}".format(point)) debug.info(1, "point={}".format(point))
@ -225,9 +243,9 @@ def scale_input_datapoint(point, data_dir):
scaled_point.append((feature-mn)/(mx-mn)) scaled_point.append((feature-mn)/(mx-mn))
return scaled_point return scaled_point
def unscale_data(data, ref_dir, pos=None): def unscale_data(data, file_path, pos=None):
if ref_dir: if file_path:
maxs,mins,avgs = get_max_min_from_datasets(ref_dir) maxs,mins,avgs = get_max_min_from_file(file_path)
else: else:
print("Must provide reference data to unscale") print("Must provide reference data to unscale")
return None return None

View File

@ -25,12 +25,12 @@ class linear_regression():
def get_prediction(self, model_inputs): def get_prediction(self, model_inputs):
file_path = data_dir +'/'+data_filename file_path = data_dir +'/'+data_filename
scaled_inputs = np.asarray([scale_input_datapoint(model_inputs, data_dir)]) scaled_inputs = np.asarray([scale_input_datapoint(model_inputs, file_path)])
features, labels = get_scaled_data(file_path, data_dir) features, labels = get_scaled_data(file_path)
self.train_model(features, labels) self.train_model(features, labels)
scaled_pred = self.model_prediction(scaled_inputs) scaled_pred = self.model_prediction(scaled_inputs)
pred = unscale_data(scaled_pred.tolist(), data_dir) pred = unscale_data(scaled_pred.tolist(), file_path)
debug.info(1,"Unscaled Prediction = {}".format(pred)) debug.info(1,"Unscaled Prediction = {}".format(pred))
return pred return pred