import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
from participantdatagenerator import ParticipantDataGenerator
# Spreadsheet handed to the generator below; judging by its name it describes
# the participant data fields (NOTE(review): confirm the expected format
# against ParticipantDataGenerator).
distribution_file_path = "Participiants Datafields Structure description.xlsx"

# Row count passed to the generator's constructor.
REQUIRED_ROWS = 300

participant_data_generator = ParticipantDataGenerator(REQUIRED_ROWS)
df = participant_data_generator.create_list_of_predefined_values(
    distribution_file_path
)

# Preview the first rows of the generated table.
df.head()
person's unique identifier | gender | occupations | willingness for geographical mobility | year of birth | nationality | highest level of educational attainment | sector of highest educational attainment | highest level of educational attainment: EQF | target group 1 | ... | 3. has a (micro-)certificate of existing competence | 4. transversal skills category | 4. number of hours of trainings in the last year in which you participated in this category | 4. name of existing competence | 4. level of existing competence | 4. has a (micro-)certificate of existing competence | individual's learning goals 2 | state's goal for individual's learning 2 | employer's goal for individual's learning 2 | indication of development goals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | P682 | male | Hospitality, Retail and Other Services Managers | medium (within neighboring region) | 1979 | Andorran | Post-secondary non-tertiary education | Business, administration and law | N/A | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | P280 | male | Hospitality, Retail and Other Services Managers | high (possible to move temporarily) | 1967 | Turkish | Early childhood education (‘less than primary’... | Generic programmes and qualifications | N/A | civil servants | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | P652 | no answer | unclassifiable occupation | low (within NUTS2 region) | 1998 | Irish | Short-cycle tertiary education | Natural sciences, mathematics and statistics | N/A | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | P208 | male | Commissioned Armed Forces Officers | low (within NUTS2 region) | 1967 | Montenegrin | Short-cycle tertiary education | Natural sciences, mathematics and statistics | N/A | socially disadvantaged people | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | P703 | male | Non-commissioned Armed Forces Officers | low (within NUTS2 region) | 1983 | Belarussian | Bachelor’s or equivalent level | Information and Communication Technologies | N/A | change residence within the EU | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
# Dimensions of the generated table as (rows, columns).
df.shape
(300, 55)
# Workbook holding both the field-name lookup sheet and the reference data.
master_workbook_path = "MASTER database for matching_v4_dmlab.xlsx"

# Lookup sheet: one row per data field, pairing the raw field name used as a
# column header in the data sheet with its human-readable label.
column_name_fields = pd.read_excel(master_workbook_path, sheet_name="P_field_name")
columns_mapping = dict(
    zip(
        column_name_fields['Participiants table_Datafield name'],
        column_name_fields['Datafield label'],
    )
)

# Reference participant table, relabelled with the human-readable labels.
# rename() leaves a column untouched when it has no entry in the mapping,
# whereas mapping the header series would silently turn it into a NaN label.
original_df = pd.read_excel(master_workbook_path, sheet_name="P_Data_300")
original_df = original_df.rename(columns=columns_mapping)
original_df.shape
(300, 57)
# Preview the first rows of the reference table after its columns were relabelled.
original_df.head()
1. training matching | 2. training matching | 3. training matching | person's unique identifier | gender | year of birth | nationality | country of residence | place of residence | highest level of educational attainment: ISCED | ... | 4. number of hours of trainings in the last year in which you participated in this category | 4. name of existing competence | 4. level of existing competence | 4. has a (micro-)certificate of existing competence | individual's learning goals 1 | individual's learning goals 2 | state's goal for individual's learning 1 | state's goal for individual's learning 2 | employer's goal for individual's learning 1 | employer's goal for individual's learning 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | M_T083 | NaN | NaN | CZ_P71 | male | 1972 | Czech | The Czech Republic_Czechia | Česko - Praha | Primary education | ... | 2.0 | 2.6 Managing digital identity (DigComp) | basic | no | spoken interaction (CEFRL) | 1.1 Browsing, searching and filtering data, in... | spoken interaction (CEFRL) | 1.1 Browsing, searching and filtering data, in... | NaN | NaN |
1 | M_T028 | M_T093 | NaN | H_P190 | female | 1964 | Hungarian | Hungary | Magyarország - Pest | Primary education | ... | NaN | NaN | NaN | NaN | P3 wellbeing (LifeComp) | P2 flexibility (LifeComp) | Valuing sustainability (GreenComp) | 1.3 Managing data, information and digital con... | NaN | NaN |
2 | M_T069 | M_T021 | NaN | CZ_P29 | female | 1972 | Czech | The Czech Republic_Czechia | Česko - Střední Čechy | Primary education | ... | NaN | NaN | NaN | NaN | 1.3 Managing data, information and digital con... | Financial & economic literacy (EntreComp) | Competence relevant for daily life competence ... | Digital financial competence (FinComp) | NaN | NaN |
3 | M_T029 | M_T084a | NaN | PL_P209 | male | 1959 | Polish | Poland | Polska - Świętokrzyskie | Doctoral or equivalent level | ... | NaN | NaN | NaN | NaN | Working with others (EntreComp) | Coping with uncertainty, ambiguity & risk (Ent... | 2.4 Collaborating through digital technologies... | L1 growth mindset (LifeComp) | NaN | NaN |
4 | M_T042 | M_T055 | NaN | H_P117 | female | 2001 | Hungarian | Hungary | NaN | Lower secondary education | ... | NaN | NaN | NaN | NaN | NaN | NaN | S2 communication (LifeComp) | NaN | spoken interaction (CEFRL) | NaN |
5 rows × 57 columns
# Columns present in both the generated and the reference table.
# The order follows df's columns (duplicates dropped) rather than a set
# intersection: set iteration order over strings changes between interpreter
# runs, which would shuffle the subplot layout each time the notebook is rerun.
columns_to_plot = list(
    dict.fromkeys(name for name in df.columns if name in original_df.columns)
)
len(columns_to_plot)
50
# Column labels that are left out of the distribution comparison plots.
# The strings must match the table column labels exactly, so their spelling
# (including "trasversal") is intentional and must not be normalised.
exclude_from_plots = [
    # Identifier and residence fields
    "person's unique identifier",
    "country of residence",
    "place of residence",
    # Additional target groups
    "target group 2",
    "target group 3",
    # Language fields
    "native language 1",
    "native language 2",
    "level of the foreign language 1",
    "foreign language 2",
    "level of the foreign language 2",
    # Training volume
    "maximum volume of training to be undertaken",
    # Existing competence, slot 1
    "1. level of existing competence",
    # Existing competence, slot 2
    "2. trasversal skill category",
    "2. number of hours of trainings in the last year in which you participated in this category",
    "2. name of existing competence",
    "2. has a (micro-)certificate of existing competence",
    # Existing competence, slot 3
    "3. transversal skills category",
    "3. number of hours of trainings in the last year in which you participated in this category",
    "3. name of existing competence",
    "3. level of existing competence",
    "3. has a (micro-)certificate of existing competence",
    # Existing competence, slot 4
    "4. transversal skills category",
    "4. number of hours of trainings in the last year in which you participated in this category",
    "4. name of existing competence",
    "4. level of existing competence",
    "4. has a (micro-)certificate of existing competence",
    # Second learning goals and development-goal indication
    "individual's learning goals 2",
    "state's goal for individual's learning 2",
    "employer's goal for individual's learning 2",
    "indication of development goals",
    # Remaining excluded fields
    "highest level of educational attainment: EQF",
    "1. name of existing competence",
    "state's goal for individual's learning 1",
    "individual's learning goals 1",
    "employer's goal for individual's learning 1",
]
# Draw one histogram per shared column, overlaying the generated ("new") and
# reference ("original") values so their distributions can be compared.

# Shared columns that survive the exclusion list, computed once and reused
# for both the grid size and the plotting loop.
plotted_columns = [name for name in columns_to_plot if name not in exclude_from_plots]
num_columns = len(plotted_columns)
columns_per_row = 3
# Ceiling division: enough rows to fit every plotted column.
num_rows = (num_columns + columns_per_row - 1) // columns_per_row
fig, axes = plt.subplots(num_rows, columns_per_row, figsize=(15, 5 * num_rows), layout="constrained")
axes = axes.flatten()

for col, ax in zip(plotted_columns, axes):
    # Long-format frame with one row per observation, tagged by its source.
    # Stacking the two sources (instead of placing them side by side as two
    # columns) keeps this working when the tables have different row counts.
    data = pd.concat(
        [
            pd.DataFrame({'variable': 'new', 'value': df[col].tolist()}),
            pd.DataFrame({'variable': 'original', 'value': original_df[col].tolist()}),
        ],
        ignore_index=True,
    )
    sns.histplot(data=data, x='value', hue='variable', ax=ax)
    # Category labels can be long; rotate them so they do not overlap.
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(col)

# The grid is rectangular, so the last row may contain axes without a column;
# hide them instead of showing empty frames.
for unused_ax in axes[num_columns:]:
    unused_ax.set_visible(False)