Fix carriage return

Eren Dogan 2022-07-22 19:54:35 +03:00
parent e3fe8c3229
commit 03422be48c
1 changed file with 325 additions and 325 deletions

@@ -1,325 +1,325 @@
#
# Copyright (c) 2016-2019 Regents of the University of California and The Board
# of Regents for the Oklahoma Agricultural and Mechanical College
# (acting for and on behalf of Oklahoma State University)
# All rights reserved.
#
import debug
import csv
import math
import numpy as np
import os

# Map process corners to numeric values so they can be used as model features
process_transform = {'SS': 0.0, 'TT': 0.5, 'FF': 1.0}

def get_data_names(file_name, exclude_area=True):
    """
    Returns just the data names in the first row of the CSV
    """
    with open(file_name, newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        # The reader is an iterable, not a list; only the first row is needed
        for row in csv_reader:
            names = row[0].split(',')
            break
    if exclude_area:
        try:
            area_ind = names.index('area')
        except ValueError:
            area_ind = -1
        if area_ind != -1:
            names = names[:area_ind] + names[area_ind+1:]
    return names

def get_data(file_name):
    """
    Returns data in the CSV as lists of features
    """
    with open(file_name, newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        row_iter = 0
        removed_items = 1
        for row in csv_reader:
            row_iter += 1
            if row_iter == 1:
                feature_names = row[0].split(',')
                input_list = [[] for _ in range(len(feature_names)-removed_items)]
                try:
                    # Save the index so the area column can be removed
                    area_ind = feature_names.index('area')
                except ValueError:
                    area_ind = -1
                try:
                    process_ind = feature_names.index('process')
                except ValueError:
                    debug.error('Process not included as a feature.')
                continue
            data = []
            split_str = row[0].split(',')
            for i in range(len(split_str)):
                if i == process_ind:
                    data.append(process_transform[split_str[i]])
                elif i == area_ind:
                    continue
                else:
                    data.append(float(split_str[i]))
            # The first feature is log2-scaled
            data[0] = math.log(data[0], 2)
            for i in range(len(data)):
                input_list[i].append(data[i])
    return input_list

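# Illustrative sketch (hypothetical header and row, not taken from any real
# dataset): given a CSV whose header is "num_words,process,area,delay",
# get_data() would map the row "16,TT,4.5,0.31" to [4.0, 0.5, 0.31] -- the
# first feature becomes log2(16) = 4.0, the 'TT' corner is looked up in
# process_transform, and the 'area' column is dropped. The result is returned
# column-wise, one list per retained feature.
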
def apply_samples_to_data(all_data, algo_samples):
    # Take samples from the algorithm and match them to samples in the data
    data_samples, unused_data = [], []
    sample_positions = set()
    for sample in algo_samples:
        sample_positions.add(find_sample_position_with_min_error(all_data, sample))
    for i in range(len(all_data)):
        if i in sample_positions:
            data_samples.append(all_data[i])
        else:
            unused_data.append(all_data[i])
    return data_samples, unused_data

def find_sample_position_with_min_error(data, sampled_vals):
    # Initialize to infinity so an exact (zero-error) match is handled correctly
    min_error = float('inf')
    sample_pos = 0
    count = 0
    for data_slice in data:
        error = squared_error(data_slice, sampled_vals)
        if error < min_error:
            min_error = error
            sample_pos = count
        count += 1
    return sample_pos

def squared_error(list_a, list_b):
    error_sum = 0
    for a, b in zip(list_a, list_b):
        error_sum += (a-b)**2
    return error_sum

def get_max_min_from_datasets(dir):
    if not os.path.isdir(dir):
        debug.warning("Input directory not found: {}".format(dir))
        return [], [], []
    # Assuming all files are CSV
    data_files = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
    maxs, mins, sums, total_count = [], [], [], 0
    for file in data_files:
        data = get_data(os.path.join(dir, file))
        # Get max, min, sum, and count from every file
        data_max, data_min, data_sum, count = [], [], [], 0
        for feature_list in data:
            data_max.append(max(feature_list))
            data_min.append(min(feature_list))
            data_sum.append(sum(feature_list))
            count = len(feature_list)
        # Aggregate the data
        if not maxs or not mins or not sums:
            maxs, mins, sums, total_count = data_max, data_min, data_sum, count
        else:
            for i in range(len(maxs)):
                maxs[i] = max(data_max[i], maxs[i])
                mins[i] = min(data_min[i], mins[i])
                sums[i] = data_sum[i]+sums[i]
            total_count += count
    avgs = [s/total_count for s in sums]
    return maxs, mins, avgs

def get_max_min_from_file(path):
    if not os.path.isfile(path):
        debug.warning("Input file not found: {}".format(path))
        return [], [], []
    data = get_data(path)
    # Get max, min, sum, and count for every feature
    data_max, data_min, data_sum, count = [], [], [], 0
    for feature_list in data:
        data_max.append(max(feature_list))
        data_min.append(min(feature_list))
        data_sum.append(sum(feature_list))
        count = len(feature_list)
    avgs = [s/count for s in data_sum]
    return data_max, data_min, avgs

def get_data_and_scale(file_name, sample_dir):
    maxs, mins, avgs = get_max_min_from_datasets(sample_dir)
    # Get data
    all_data = get_data(file_name)
    # Scale the data from the file by the datasets' max/min
    self_scaled_data = [[] for _ in range(len(all_data[0]))]
    for feature_list, cur_max, cur_min in zip(all_data, maxs, mins):
        for i in range(len(feature_list)):
            self_scaled_data[i].append((feature_list[i]-cur_min)/(cur_max-cur_min))
    return np.asarray(self_scaled_data)

def rescale_data(data, old_maxs, old_mins, new_maxs, new_mins):
    # Unscale from the old values, then rescale by the new values
    data_new_scaling = []
    for data_row in data:
        scaled_row = []
        for val, old_max, old_min, cur_max, cur_min in zip(data_row, old_maxs, old_mins, new_maxs, new_mins):
            unscaled_data = val*(old_max-old_min) + old_min
            scaled_row.append((unscaled_data-cur_min)/(cur_max-cur_min))
        data_new_scaling.append(scaled_row)
    return data_new_scaling

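# Worked example (hypothetical numbers): a value scaled to 0.5 in an old range
# [2, 10] unscales to 0.5*(10-2)+2 = 6; against a new range [2, 18] it
# rescales to (6-2)/(18-2) = 0.25.
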
def sample_from_file(num_samples, file_name, sample_dir=None):
    """
    Get a portion of the data from a CSV file and scale it based on the
    max/min of the dataset. Duplicate samples are trimmed.
    """
    if sample_dir:
        maxs, mins, avgs = get_max_min_from_datasets(sample_dir)
    else:
        maxs, mins, avgs = [], [], []
    # Get data
    all_data = get_data(file_name)
    # Get the algorithm's sample points, assuming a hypercube for now
    num_labels = 1
    inp_dims = len(all_data) - num_labels
    samples = np.random.rand(num_samples, inp_dims)
    # Scale the data from the file by its own max/min
    self_scaled_data = [[] for _ in range(len(all_data[0]))]
    self_maxs, self_mins = [], []
    for feature_list in all_data:
        max_val = max(feature_list)
        self_maxs.append(max_val)
        min_val = min(feature_list)
        self_mins.append(min_val)
        for i in range(len(feature_list)):
            self_scaled_data[i].append((feature_list[i]-min_val)/(max_val-min_val))
    # Apply the algorithm's sampling points to the available data
    sampled_data, unused_data = apply_samples_to_data(self_scaled_data, samples)
    # Unscale the values and rescale using all available data
    # (both sampled and unused points are rescaled)
    if len(maxs) != 0 and len(mins) != 0:
        sampled_data = rescale_data(sampled_data, self_maxs, self_mins, maxs, mins)
        unused_new_scaling = rescale_data(unused_data, self_maxs, self_mins, maxs, mins)
    else:
        # Without reference datasets, keep the file's own scaling
        unused_new_scaling = unused_data
    return np.asarray(sampled_data), np.asarray(unused_new_scaling)

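# Usage sketch (assumes a hypothetical "sim_data.csv" with a header row and a
# single label column): draw 10 random points in the unit hypercube and snap
# each to the nearest available data point by squared error over the input
# features.
#     sampled, unused = sample_from_file(10, "sim_data.csv")
# Because several random points can snap to the same row, sampled may hold
# fewer than 10 rows -- this is the duplicate trimming the docstring mentions.
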
def get_scaled_data(file_name):
    """Get data from a CSV file and scale it based on the max/min of the dataset"""
    if file_name:
        maxs, mins, avgs = get_max_min_from_file(file_name)
    else:
        maxs, mins, avgs = [], [], []
    # Get data
    all_data = get_data(file_name)
    # The data is scaled by max/min and converted from feature lists to points
    self_scaled_data = scale_data_and_transform(all_data)
    data_np = np.asarray(self_scaled_data)
    return data_np

def scale_data_and_transform(data):
    """
    Assume data is a list of features; change it to a list of points and max/min scale
    """
    scaled_data = [[] for _ in range(len(data[0]))]
    for feature_list in data:
        max_val = max(feature_list)
        min_val = min(feature_list)
        for i in range(len(feature_list)):
            if max_val == min_val:
                scaled_data[i].append(0.0)
            else:
                scaled_data[i].append((feature_list[i]-min_val)/(max_val-min_val))
    return scaled_data

def scale_input_datapoint(point, file_path):
    """
    Input data has no output and needs to be scaled like the model inputs
    during training.
    """
    maxs, mins, avgs = get_max_min_from_file(file_path)
    debug.info(3, "maxs={}".format(maxs))
    debug.info(3, "mins={}".format(mins))
    debug.info(3, "point={}".format(point))
    scaled_point = []
    for feature, mx, mn in zip(point, maxs, mins):
        if mx == mn:
            scaled_point.append(0.0)
        else:
            scaled_point.append((feature-mn)/(mx-mn))
    return scaled_point

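# Usage sketch (hypothetical file and values): if a feature's range in
# "sim_data.csv" were min=2 and max=10, a new input value of 6 would scale to
# (6-2)/(10-2) = 0.5, matching the min/max scaling applied during training.
#     scaled = scale_input_datapoint([6.0], "sim_data.csv")
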
def unscale_data(data, file_path, pos=None):
    if file_path:
        maxs, mins, avgs = get_max_min_from_file(file_path)
    else:
        debug.error("Must provide reference data to unscale")
        return None
    # Hard-coded to only convert the last max/min (i.e. the label of the data)
    if pos is None:
        maxs, mins, avgs = maxs[-1], mins[-1], avgs[-1]
    else:
        maxs, mins, avgs = maxs[pos], mins[pos], avgs[pos]
    unscaled_data = []
    for data_row in data:
        unscaled_val = data_row*(maxs-mins) + mins
        unscaled_data.append(unscaled_val)
    return unscaled_data

def abs_error(labels, preds):
    # Mean relative error across all label/prediction pairs
    total_error = 0
    for label_i, pred_i in zip(labels, preds):
        cur_error = abs(label_i[0]-pred_i[0])/label_i[0]
        total_error += cur_error
    return total_error/len(labels)

def max_error(labels, preds):
    # Largest relative error across all label/prediction pairs
    mx_error = 0
    for label_i, pred_i in zip(labels, preds):
        cur_error = abs(label_i[0]-pred_i[0])/label_i[0]
        mx_error = max(cur_error, mx_error)
    return mx_error

def min_error(labels, preds):
    # Smallest relative error across all label/prediction pairs
    mn_error = 1
    for label_i, pred_i in zip(labels, preds):
        cur_error = abs(label_i[0]-pred_i[0])/label_i[0]
        mn_error = min(cur_error, mn_error)
    return mn_error

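# Worked example (hypothetical values): labels and predictions are rows whose
# first entry is the value of interest, and all three metrics report error
# relative to the label.
#     labels = [[2.0], [4.0]]
#     preds = [[1.0], [5.0]]
#     abs_error(labels, preds)  # (0.5 + 0.25) / 2 = 0.375
#     max_error(labels, preds)  # 0.5
#     min_error(labels, preds)  # 0.25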