import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
from participantdatagenerator import ParticipantDataGenerator


# Row count handed to the generator; the recorded output shape for the
# generated frame is (300, 55).
REQUIRED_ROWS = 300

# Excel workbook passed to the generator when building the dataset.
# NOTE(review): presumably it describes the per-field value distributions --
# confirm against ParticipantDataGenerator.
distribution_file_path = "Participiants Datafields Structure description.xlsx"

participant_data_generator = ParticipantDataGenerator(REQUIRED_ROWS)

# Build the generated participant dataset from the structure description.
df = participant_data_generator.create_list_of_predefined_values(distribution_file_path)


# Notebook display: first rows of the generated dataset.
df.head()


# Notebook display: (rows, columns) of the generated dataset.
df.shape

(300, 55)


# Load both sheets of the master workbook in a single read_excel call, so the
# file is opened and parsed once. With a list of sheet names, read_excel
# returns a dict of DataFrames keyed by sheet name.
master_database_path = "MASTER database for matching_v4_dmlab.xlsx"
master_sheets = pd.read_excel(
    master_database_path,
    sheet_name=["P_field_name", "P_Data_300"],
)

# Sheet that pairs each participant-table datafield name with its label.
# (The previous chained assignment also bound ``original_df`` to this sheet,
# which was misleading; ``original_df`` is defined only from the data sheet.)
column_name_fields = master_sheets["P_field_name"]


# Lookup: datafield name -> datafield label.
columns_mapping = dict(
    zip(
        column_name_fields['Participiants table_Datafield name'],
        column_name_fields['Datafield label'],
    )
)


original_df = master_sheets["P_Data_300"]


# Relabel the original columns with their datafield labels so they can be
# matched against the generated dataset. ``rename`` leaves columns that have
# no entry in the mapping untouched, whereas mapping the column index through
# the dict would replace every unmapped name with NaN.
original_df = original_df.rename(columns=columns_mapping)


# Notebook display: (rows, columns) of the original dataset.
original_df.shape

(300, 57)


# Notebook display: first rows of the original dataset.
original_df.head()


# Columns present in both the generated and the original dataset.
# The order follows ``df.columns`` so the result (and therefore the plot
# layout) is the same on every run; a plain set intersection has arbitrary
# ordering. ``dict.fromkeys`` drops repeated labels while keeping first-seen
# order, matching the uniqueness of the set-based version.
original_column_names = set(original_df.columns)
columns_to_plot = list(
    dict.fromkeys(name for name in df.columns if name in original_column_names)
)


# Notebook display: number of shared columns.
len(columns_to_plot)

50


# Column labels left out of the comparison plots. The labels are matched
# verbatim against the dataset column names, so spelling and spacing must be
# kept exactly as written here.
exclude_from_plots = [
    # identity, residence, target groups and languages
    "person's unique identifier",
    "country of residence",
    "place of residence",
    "target group 2",
    "target group 3",
    "native language  1",
    "native language  2",
    "level of the foreign language 1",
    "foreign language 2",
    "level of the foreign language 2",
    "maximum volume of training to be undertaken",
    # existing competences, slots 1-4
    "1. level of existing competence",
    "2. trasversal skill category",
    "2. number of hours of trainings in the last year in which you participated in this category",
    "2. name of existing competence",
    "2. has a (micro-)certificate of existing competence",
    "3. transversal skills category",
    "3. number of hours of trainings in the last year in which you participated in this category",
    "3. name of existing competence",
    "3. level of existing competence",
    "3. has a (micro-)certificate of existing competence",
    "4. transversal skills category",
    "4. number of hours of trainings in the last year in which you participated in this category",
    "4. name of existing competence",
    "4. level of existing competence",
    "4. has a (micro-)certificate of existing competence",
    # learning goals and remaining fields
    "individual's learning goals 2",
    "state's goal for individual's learning 2",
    "employer's goal for individual's learning 2",
    "indication of development goals",
    "highest level of educational attainment: EQF",
    "1. name of existing competence",
    "state's goal for individual's learning 1",
    "individual's learning goals 1",
    "employer's goal for individual's learning 1",
]


# Shared columns that are not on the exclusion list; computed once and reused
# for both the grid size and the plotting loop.
plot_columns = [name for name in columns_to_plot if name not in exclude_from_plots]
num_columns = len(plot_columns)
columns_per_row = 3
# Ceiling division, with a floor of one row because plt.subplots rejects a
# zero-row grid (which would otherwise happen when nothing is left to plot).
num_rows = max(1, (num_columns + columns_per_row - 1) // columns_per_row)
# squeeze=False guarantees a 2-D array of axes for any grid shape, so the
# flatten() below is always valid.
fig, axes = plt.subplots(
    num_rows,
    columns_per_row,
    figsize=(15, 5 * num_rows),
    layout="constrained",
    squeeze=False,
)
axes = axes.flatten()

# One histogram per column, overlaying generated ("new") and original values.
for col, ax in zip(plot_columns, axes):
    # tolist() discards the index, so the two sources are paired by position.
    new = df[col].tolist()
    og = original_df[col].tolist()
    data = pd.DataFrame({'new': new, 'original': og, 'column': col})
    # Long format: 'variable' holds the source (new/original), 'value' the data.
    data = data.melt(id_vars='column')

    p = sns.histplot(data=data, x='value', hue='variable', ax=ax)
    p.tick_params(axis='x', rotation=90)
    ax.set_title(col)

# Hide the trailing axes of the last row that received no plot, so the figure
# does not show empty frames.
for unused_ax in axes[num_columns:]:
    unused_ax.set_visible(False)

	person's unique identifier	gender	occupations	willingness for geographical mobility	year of birth	nationality	highest level of educational attainment	sector of highest educational attainment	highest level of educational attainment: EQF	target group 1	...	3. has a (micro-)certificate of existing competence	4. transversal skills category	4. number of hours of trainings in the last year in which you participated in this category	4. name of existing competence	4. level of existing competence	4. has a (micro-)certificate of existing competence	individual's learning goals 2	state's goal for individual's learning 2	employer's goal for individual's learning 2	indication of development goals
0	P682	male	Hospitality, Retail and Other Services Managers	medium (within neighboring region)	1979	Andorran	Post-secondary non-tertiary education	Business, administration and law	N/A	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	P280	male	Hospitality, Retail and Other Services Managers	high (possible to move temporarily)	1967	Turkish	Early childhood education (‘less than primary’...	Generic programmes and qualifications	N/A	civil servants	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	P652	no answer	unclassifiable occupation	low (within NUTS2 region)	1998	Irish	Short-cycle tertiary education	Natural sciences, mathematics and statistics	N/A	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	P208	male	Commissioned Armed Forces Officers	low (within NUTS2 region)	1967	Montenegrin	Short-cycle tertiary education	Natural sciences, mathematics and statistics	N/A	socially disadvantaged people	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	P703	male	Non-commissioned Armed Forces Officers	low (within NUTS2 region)	1983	Belarussian	Bachelor’s or equivalent level	Information and Communication Technologies	N/A	change residence within the EU	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	1. training matching	2. training matching	3. training matching	person's unique identifier	gender	year of birth	nationality	country of residence	place of residence	highest level of educational attainment: ISCED	...	4. number of hours of trainings in the last year in which you participated in this category	4. name of existing competence	4. level of existing competence	4. has a (micro-)certificate of existing competence	individual's learning goals 1	individual's learning goals 2	state's goal for individual's learning 1	state's goal for individual's learning 2	employer's goal for individual's learning 1	employer's goal for individual's learning 2
0	M_T083	NaN	NaN	CZ_P71	male	1972	Czech	The Czech Republic_Czechia	Česko - Praha	Primary education	...	2.0	2.6 Managing digital identity (DigComp)	basic	no	spoken interaction (CEFRL)	1.1 Browsing, searching and filtering data, in...	spoken interaction (CEFRL)	1.1 Browsing, searching and filtering data, in...	NaN	NaN
1	M_T028	M_T093	NaN	H_P190	female	1964	Hungarian	Hungary	Magyarország - Pest	Primary education	...	NaN	NaN	NaN	NaN	P3 wellbeing (LifeComp)	P2 flexibility (LifeComp)	Valuing sustainability (GreenComp)	1.3 Managing data, information and digital con...	NaN	NaN
2	M_T069	M_T021	NaN	CZ_P29	female	1972	Czech	The Czech Republic_Czechia	Česko - Střední Čechy	Primary education	...	NaN	NaN	NaN	NaN	1.3 Managing data, information and digital con...	Financial & economic literacy (EntreComp)	Competence relevant for daily life competence ...	Digital financial competence (FinComp)	NaN	NaN
3	M_T029	M_T084a	NaN	PL_P209	male	1959	Polish	Poland	Polska - Świętokrzyskie	Doctoral or equivalent level	...	NaN	NaN	NaN	NaN	Working with others (EntreComp)	Coping with uncertainty, ambiguity & risk (Ent...	2.4 Collaborating through digital technologies...	L1 growth mindset (LifeComp)	NaN	NaN
4	M_T042	M_T055	NaN	H_P117	female	2001	Hungarian	Hungary	NaN	Lower secondary education	...	NaN	NaN	NaN	NaN	NaN	NaN	S2 communication (LifeComp)	NaN	spoken interaction (CEFRL)	NaN

Participant Dataset Generation Example

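The generator requires two inputs: the number of rows to generate and the path to the Excel file that describes the datafield structure.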

The generated dataset is shown below.

The column names of the original dataset need to be mapped to the datafield labels so that they match the column names of the generated dataset.

Several columns have either too many categories to plot or too many empty rows; these are excluded from the comparison plots.