Source code for bead.src.utils.normalization

# Custom normalization functions for analysis.

import numpy as np
from . import helper


def normalize_jet_pj_custom(data):
    """
    Apply the per-feature normalization recipe to a jet-level array.

    Expected input layout (one row per jet, 7 columns):
        0: event_id          -> passed through
        1: jet_id            -> passed through
        2: num_constituents  -> "robust"
        3: b_tagged          -> passed through
        4: jet_pt            -> "log+standard"
        5: jet_eta           -> "standard"
        6: jet_phi           -> "sincos" (expected to produce sin and cos columns)

    Output layout (8 columns when the "sincos" chain yields two columns):
        [event_id, jet_id, num_constituents_norm, b_tagged,
         jet_pt_norm, jet_eta_norm, jet_phi_sin, jet_phi_cos]

    Args:
        data (np.ndarray): Input array of shape (N, 7).

    Returns:
        normalized_data (np.ndarray): Horizontally stacked result columns.
        scalers (dict): Fitted scaler objects returned by
            ``helper.normalize_data``, keyed by "num_constituents",
            "jet_pt", "jet_eta" and "jet_phi".
    """

    def as_column(index):
        # Extract one feature as an (N, 1) float column.
        return data[:, index].reshape(-1, 1).astype(float)

    # Identifier / flag columns are only cast to float, never rescaled.
    evt_col = as_column(0)
    jet_col = as_column(1)
    btag_col = as_column(3)

    # (scaler key, source column index, normalization chain).
    recipes = (
        ("num_constituents", 2, "robust"),
        ("jet_pt", 4, "log+standard"),
        ("jet_eta", 5, "standard"),
        ("jet_phi", 6, "sincos"),
    )

    scalers = {}
    scaled = {}
    for key, index, chain in recipes:
        scaled[key], scalers[key] = helper.normalize_data(as_column(index), chain)

    # Reassemble in the documented output order; the phi entry contributes
    # whatever columns the "sincos" chain produced.
    normalized_data = np.hstack(
        [
            evt_col,
            jet_col,
            scaled["num_constituents"],
            btag_col,
            scaled["jet_pt"],
            scaled["jet_eta"],
            scaled["jet_phi"],
        ]
    )
    return normalized_data, scalers
def normalize_constit_pj_custom(data):
    """
    Apply the per-feature normalization recipe to a constituent-level array.

    Expected input layout (one row per constituent, 7 columns):
        0: event_id     -> passed through
        1: jet_id       -> passed through
        2: constit_id   -> passed through
        3: b_tagged     -> passed through
        4: constit_pt   -> "log+standard"
        5: constit_eta  -> "standard"
        6: constit_phi  -> "sincos" (expected to produce sin and cos columns)

    Output layout (8 columns when the "sincos" chain yields two columns):
        [event_id, jet_id, constit_id, b_tagged,
         constit_pt_norm, constit_eta_norm, constit_phi_sin, constit_phi_cos]

    Args:
        data (np.ndarray): Input array of shape (N, 7).

    Returns:
        normalized_data (np.ndarray): Horizontally stacked result columns.
        scalers (dict): Fitted scaler objects returned by
            ``helper.normalize_data``, keyed by "constit_pt",
            "constit_eta" and "constit_phi".
    """

    def as_column(index):
        # Extract one feature as an (N, 1) float column.
        return data[:, index].reshape(-1, 1).astype(float)

    # Columns 0-3 are identifiers / flags: cast to float, otherwise untouched.
    passthrough = [as_column(index) for index in range(4)]

    # (scaler key, source column index, normalization chain).
    recipes = (
        ("constit_pt", 4, "log+standard"),
        ("constit_eta", 5, "standard"),
        ("constit_phi", 6, "sincos"),
    )

    scalers = {}
    scaled_columns = []
    for key, index, chain in recipes:
        scaled, fitted = helper.normalize_data(as_column(index), chain)
        scalers[key] = fitted
        scaled_columns.append(scaled)

    # Pass-through columns first, then pt, eta and the phi block.
    normalized_data = np.hstack(passthrough + scaled_columns)
    return normalized_data, scalers
def invert_normalize_jet_pj_custom(normalized_data, scalers):
    """
    Undo the normalization produced by ``normalize_jet_pj_custom``.

    Expected input layout (8 columns):
        0:   event_id               (pass-through)
        1:   jet_id                 (pass-through)
        2:   num_constituents_norm  (chain "robust")
        3:   b_tagged               (pass-through)
        4:   jet_pt_norm            (chain "log+standard")
        5:   jet_eta_norm           (chain "standard")
        6-7: jet_phi_sin, jet_phi_cos (chain "sincos")

    Args:
        normalized_data (np.ndarray): Array of shape (N, 8) as laid out above.
        scalers (dict): Fitted scalers keyed by "num_constituents", "jet_pt",
            "jet_eta" and "jet_phi", as returned by the forward function.

    Returns:
        np.ndarray: Columns
            [event_id, jet_id, num_constituents, b_tagged,
             jet_pt, jet_eta, jet_phi]
        (shape (N, 7) provided the "jet_phi" scaler collapses its two input
        columns back into a single angle column).

    Note:
        The actual inversion is delegated to ``helper.invert_normalize_data``;
        each stored scaler is expected to reverse its own chain (e.g. the
        "log+standard" chain undoing the standard scaling before the log).
    """

    def column(index):
        # Single feature as an (N, 1) column, dtype left as-is.
        return normalized_data[:, index].reshape(-1, 1)

    def restore(values, key):
        # Run the stored scaler for `key` backwards over `values`.
        return helper.invert_normalize_data(values, scalers[key])

    # Columns that were never rescaled.
    evt_col = column(0)
    jet_col = column(1)
    btag_col = column(3)

    # Rescaled single-column features.
    n_const = restore(column(2), "num_constituents")
    pt = restore(column(4), "jet_pt")
    eta = restore(column(5), "jet_eta")

    # The phi scaler takes the sin and cos columns together.
    phi = restore(normalized_data[:, 6:8], "jet_phi")

    return np.hstack([evt_col, jet_col, n_const, btag_col, pt, eta, phi])
def invert_normalize_constit_pj_custom(normalized_data, scalers):
    """
    Inverts the normalization applied by ``normalize_constit_pj_custom``.

    The input normalized_data is assumed to be a NumPy array of shape (N, 8)
    with columns:
        0:   event_id          (unchanged)
        1:   jet_id            (unchanged)
        2:   constit_id        (unchanged)
        3:   b_tagged          (unchanged)
        4:   constit_pt_norm   (normalized via "log+standard")
        5:   constit_eta_norm  (normalized via "standard")
        6-7: constit_phi_sin, constit_phi_cos (normalized via "sincos")

    Args:
        normalized_data (np.ndarray): Array of shape (N, 8) as laid out above.
        scalers (dict): Fitted scalers keyed by "constit_pt", "constit_eta"
            and "constit_phi", as returned by ``normalize_constit_pj_custom``.

    Returns:
        original_data (np.ndarray): Array with columns
            [event_id, jet_id, constit_id, b_tagged,
             constit_pt, constit_eta, constit_phi]
        (shape (N, 7) provided the "constit_phi" scaler collapses its two
        input columns back into a single angle column).

    Raises:
        KeyError: If ``scalers`` lacks one of the expected keys.

    Note:
        The inversion itself is delegated to ``helper.invert_normalize_data``;
        each stored scaler is expected to reverse its own chain.
    """
    # 1. The unchanged columns: event_id, jet_id, constit_id, b_tagged.
    event_id = normalized_data[:, 0].reshape(-1, 1)
    jet_id = normalized_data[:, 1].reshape(-1, 1)
    constit_id = normalized_data[:, 2].reshape(-1, 1)
    b_tagged = normalized_data[:, 3].reshape(-1, 1)

    # 2. Invert constit_pt (chain: "log+standard").
    norm_constit_pt = normalized_data[:, 4].reshape(-1, 1)
    original_constit_pt = helper.invert_normalize_data(
        norm_constit_pt, scalers["constit_pt"]
    )

    # 3. Invert constit_eta (chain: "standard").
    norm_constit_eta = normalized_data[:, 5].reshape(-1, 1)
    original_constit_eta = helper.invert_normalize_data(
        norm_constit_eta, scalers["constit_eta"]
    )

    # 4. Invert constit_phi (chain: "sincos"); the sin and cos columns are
    #    handed to the scaler together.
    norm_constit_phi = normalized_data[:, 6:8]
    original_constit_phi = helper.invert_normalize_data(
        norm_constit_phi, scalers["constit_phi"]
    )

    # Concatenate the recovered columns in order. Column 2 is the
    # pass-through constit_id; the previous code referenced an undefined
    # `original_num_const` here and raised NameError on every call.
    original_data = np.hstack(
        [
            event_id,
            jet_id,
            constit_id,
            b_tagged,
            original_constit_pt,
            original_constit_eta,
            original_constit_phi,
        ]
    )
    return original_data