import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
from trainingdatagenerator import TrainingDataGenerator
# Row count handed to the generator.
REQUIRED_ROWS = 100
# Spreadsheet passed to the generator below; judging by its name it holds the
# per-field value distributions to sample from — TODO confirm against
# TrainingDataGenerator.
distribution_file_path = "Trainings Datafields Structure percentages.xlsx"
training_data_generator = TrainingDataGenerator(REQUIRED_ROWS)
# Build the generated dataset from the distribution file.
df = training_data_generator.create_list_with_defined_distributions(
distribution_file_path
)
# Preview the first rows (notebook display expression).
df.head()
unique training identification code | target group | training language | related occupation | accreditation | quality assurance | way of teaching | supplementary services | lowest educational attainment | eqf levels | ... | 3. minimum competence level required for the training | 3. completion of the training provides an average level of competence | 4. Framework category | 4. ID and name of existing competence | 4. minimum competence level required for the training | 4. completion of the training provides an average level of competence | 5. Framework category | 5. ID and name of existing competence | 5. minimum competence level required for the training | 5. completion of the training provides an average level of competence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | MT7639 | no target group can be selected | Czech | Non-commissioned Armed Forces Officers | not accredited | Internal QA systems | personal attendance | Group discussion and activities | Primary education | 4.0 | ... | advanced | expert | NaN | NaN | NaN | NaN | DigComp | 4.1 Protecting devices (DigComp) | 4 | 4 |
1 | MT1750 | civil servants | English | Health Associate Professionals | accredited by a national institution | Feedback from participants | online at any time | Hands-on training | Upper secondary education | NaN | ... | basic | expert | DigComp | 2.4 Collaborating through digital technologies... | 7 | 1 | NaN | NaN | NaN | NaN |
2 | MT9661 | no target group can be selected | English | Protective Services Workers | accredited by a national institution | Internal QA systems | online at any time | Case studies or other required reading | Primary education | 4.0 | ... | B2 | B1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | MT8889 | change residence within the EU | Available in any language | Refuse Workers and Other Elementary Workers | accredited by a national institution | Feedback from participants | online at any time | Lectures | Primary education | 2.0 | ... | C1 | B1 | DigComp | 2.2 Sharing through digital technologies (DigC... | 6 | 3 | NaN | NaN | NaN | NaN |
4 | MT3726 | women | Available in any language | Electrical and Electronic Trades Workers | not accredited | Internal QA systems | online at any time | Simulation | Upper secondary education | NaN | ... | expert | expert | DigComp | 5.2 Identifying needs and technological respon... | 1 | 3 | NaN | NaN | NaN | NaN |
5 rows × 39 columns
# Sanity check: (rows, columns) of the generated data.
df.shape
(100, 39)
# Load the original dataset that the generated data is compared against.
original_df = pd.read_excel("MASTER database for matching P_v6 & T_v6__v3_dmlab.xlsx")
# Preview the first rows (notebook display expression).
original_df.head()
Matching_T_code | name of training | target group | secondary target group | tertiary target group | training language | country of training | NUTS2 | related occupation | accreditation | ... | 3. minimum competence level required for the training | 3. completion of the training provides an average level of competence | 4. Framework category | 4. ID and name of existing competence | 4. minimum competence level required for the training | 4. completion of the training provides an average level of competence | 5. Framework category | 5. ID and name of existing competence | 5. minimum competence level required for the training | 5. completion of the training provides an average level of competence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | M_T001 | 50 hours online course in German language | no target group can be selected | NaN | NaN | NaN | NaN | NaN | NaN | not accredited | ... | A1 | B1 | CEFRL | spoken production (CEFRL) | A1 | B1 | CEFRL | writing (CEFRL) | A1 | B1 |
1 | M_T002 | A2 Level Slovak Language Course for Effective ... | immigrants from outside the EU | NaN | NaN | Ukrainian | Slovakia | Slovensko - Bratislavský kraj | unclassifiable occupation | accredited by a national institution | ... | A1 | A2 | CEFRL | reading (CEFRL) | A1 | A2 | CEFRL | spoken production (CEFRL) | A1 | A2 |
2 | M_T003 | Acquiring basic financial competences for ever... | employed | unemployed | NaN | NaN | NaN | NaN | unclassifiable occupation | not accredited | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | M_T004 | Advanced Digital Marketing Strategies for Mana... | employed | NaN | NaN | NaN | Poland | NaN | Managers | not accredited | ... | advanced | expert | EntreComp | Creativity (EntreComp) | basic | advanced | LifeComp | S2 communication (LifeComp) | advanced | advanced |
4 | M_T005 | Advanced Digital Skills for the Digital Age | self-employed | workers in jobs at risk from digitalisation/au... | non-native speakers | English | NaN | NaN | unclassifiable occupation | accredited by a national institution | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 47 columns
# Compare the generated data with the original dataset: one histogram per
# column the two frames have in common, with both sources overlaid.

# Strip surrounding whitespace from the original frame's column names so they
# can be matched against the generated frame's column names.
original_df = original_df.rename(columns=lambda x: x.strip())

# Keep the generated frame's column order. A bare set intersection has no
# defined order, so the subplot layout would change from run to run.
original_columns = set(original_df.columns)
columns_to_plot = [col for col in df.columns if col in original_columns]

fixing_types = [
    ". minimum competence level required for the training",
    ". completion of the training provides an average level of competence",
]
# The competence-level columns hold both numeric and textual levels, so cast
# them to str in both frames to plot them as categories.
# NOTE: astype(str) turns missing values into the literal string "nan", which
# then shows up as its own category in the histograms.
for frame in (df, original_df):
    for i in range(1, 6):
        for col in fixing_types:
            frame[str(i) + col] = frame[str(i) + col].astype(str)

exclude_from_plots = [str(i) + ". ID and name of existing competence" for i in range(1, 6)]
plot_columns = [col for col in columns_to_plot if col not in exclude_from_plots]
num_columns = len(plot_columns)
columns_per_row = 3
# Ceiling division: enough grid rows to hold every plotted column.
num_rows = (num_columns + columns_per_row - 1) // columns_per_row
fig, axes = plt.subplots(num_rows, columns_per_row, figsize=(15, 5 * num_rows), layout="constrained")
axes = axes.flatten()

for col, ax in zip(plot_columns, axes):
    new = df[col].tolist()
    og = original_df[col].tolist()
    # Build the long-format table directly (one row per observation, labelled
    # by source). Placing the two lists side by side as DataFrame columns
    # raises ValueError whenever the frames have different numbers of rows.
    data = pd.DataFrame(
        {
            "column": col,
            "variable": ["new"] * len(new) + ["original"] * len(og),
            "value": new + og,
        }
    )
    # NOTE(review): bars show raw counts; if the two frames differ in size,
    # consider stat="proportion", common_norm=False to make them comparable.
    p = sns.histplot(data=data, x="value", hue="variable", ax=ax)
    p.tick_params(axis="x", rotation=90)
    ax.set_title(col)

# Hide the unused cells of the grid when the number of plotted columns is not
# a multiple of columns_per_row.
for unused_ax in axes[num_columns:]:
    unused_ax.set_visible(False)