Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
evaluation-report-generator
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
alexander06
evaluation-report-generator
Commits
e98b464a
Commit
e98b464a
authored
1 year ago
by
alexander06
Browse files
Options
Downloads
Patches
Plain Diff
Remove redundant clean in main pipeline
parent
8ac2d2f9
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/pipeline.ipynb
+4
-10
4 additions, 10 deletions
src/pipeline.ipynb
with
4 additions
and
10 deletions
src/pipeline.ipynb
+
4
−
10
View file @
e98b464a
...
@@ -105,11 +105,11 @@
...
@@ -105,11 +105,11 @@
"\n",
"\n",
"participation = {}\n",
"participation = {}\n",
"\n",
"\n",
"participation_path = os.path.join(CONSTANTS.
RAW
_DATA_PATH, \"participation\")\n",
"participation_path = os.path.join(CONSTANTS.
CLEAN
_DATA_PATH, \"participation\")\n",
"for file in os.listdir(participation_path):\n",
"for file in os.listdir(participation_path):\n",
"\tif not file.endswith(\".csv\") or file.startswith(\"example\"): continue\n",
"\tif not file.endswith(\".csv\") or file.startswith(\"example\"): continue\n",
"\tsemester = file.replace(\".csv\", \"\")\n",
"\tsemester = file.replace(\".csv\", \"\")\n",
"\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file)
, sep=\";\"
)"
"\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file))"
]
]
},
},
{
{
...
@@ -143,15 +143,10 @@
...
@@ -143,15 +143,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
null
,
"execution_count":
5
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"#print(result_frames)\n",
"def norm_name(name: str) -> str:\n",
"\tif ', ' not in name: return name\n",
"\tsplit = name.split(\", \")\n",
"\treturn split[1] + \" \" + split[0]\n",
"for (institute_semester, df) in result_frames.items():\n",
"for (institute_semester, df) in result_frames.items():\n",
"\tcollection = []\n",
"\tcollection = []\n",
"\tinstitute, semester = institute_semester.split(\"_\") \n",
"\tinstitute, semester = institute_semester.split(\"_\") \n",
...
@@ -163,8 +158,7 @@
...
@@ -163,8 +158,7 @@
"\t\t# This may lead to data loss but is impossible to work around\n",
"\t\t# This may lead to data loss but is impossible to work around\n",
"\t\tsemester_obj = participation[semester]\n",
"\t\tsemester_obj = participation[semester]\n",
"\t\tsemester_obj.drop_duplicates(inplace=True)\n",
"\t\tsemester_obj.drop_duplicates(inplace=True)\n",
"\t\tnames = semester_obj[\"Person-Name\"]\n",
"\n",
"\t\tsemester_obj[\"Person-Name\"] = names.map(norm_name)\n",
"\t\tdf = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=[\"Veranstaltung-Nr.\", \"Person-Name\", \"Veranstaltung-Name\"], how=\"left\")\n",
"\t\tdf = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=[\"Veranstaltung-Nr.\", \"Person-Name\", \"Veranstaltung-Name\"], how=\"left\")\n",
"\n",
"\n",
"\tfor row in df.iterrows():\n",
"\tfor row in df.iterrows():\n",
...
...
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Evaluation Report Pipeline
# Evaluation Report Pipeline
This file hosts the pipeline, which generates all images containing the final evaluation results
This file hosts the pipeline, which generates all images containing the final evaluation results
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
python
```
python
import
os
,
pandas
,
math
,
json
import
os
,
pandas
,
math
,
json
import
regex
as
re
import
regex
as
re
import
lib.constants
as
CONSTANTS
import
lib.constants
as
CONSTANTS
import
lib.api
as
api
import
lib.api
as
api
# Lazily-filled cache: question UUID -> answer-value list (populated in calc_score).
question_cache = {}

# Scale id -> list of answer values; filled by generate_intervals().
intervals = {}

# Matches lowercase hex UUIDs (8-4-4-4-12) used as question column names.
uuid_regex = re.compile(r'\b[0-9a-f]{8}\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\b[0-9a-f]{12}\b')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
python
```
python
def file_iterator():
    """Yield ``[institute, semester, DataFrame]`` for every ``all.csv`` below CLEAN_DATA_PATH.

    The last two path segments of the directory containing ``all.csv`` are
    taken as institute and semester.
    """
    for root, _dirs, files in os.walk(CONSTANTS.CLEAN_DATA_PATH):
        if 'all.csv' not in files:
            continue
        frame = pandas.read_csv(os.path.join(root, "all.csv"))
        yield root.split("/")[-2:] + [frame]
def generate_scales():
    """Fetch all scales from the API and return a map of scale id -> answer list."""
    return {scale["id"]: api.get_answers_for(scale["id"]) for scale in api.get_scales()}
def generate_intervals():
    """Populate the module-level ``intervals`` map (scale id -> answer values)."""
    global intervals
    intervals = {}
    for scale_id, answers in generate_scales().items():
        intervals[scale_id] = [answer['value'] for answer in answers]
def get_factors():
    """Return a rounded weight factor per two-value interval.

    Intervals that are unweighted or only have two steps are skipped.
    """
    factor = {}
    for interval in intervals:
        # These intervals either just have two steps or the answers have no weight
        if interval in ("scale_time", "y_n"):
            continue
        a, b = intervals[interval]
        factor[interval] = round(abs(a - b - 1) / 2, 0)
    return factor
# The type of the value is unknown
def convert_score(value, factor) -> int:
    """Map a raw answer *value* onto its weight in *factor*.

    Values that look like a single digit with an optional trailing ``.0``
    (e.g. ``"3"``, ``"3.0"``, ``3.0``) index into *factor*; answers are
    1-based, so 1 is subtracted. Anything else scores 0.
    """
    # Raw string so "\d" is a regex character class, not an invalid escape
    # (the original non-raw literal triggers a SyntaxWarning on Python 3.12+).
    number_regex = re.compile(r'^\d(\.0)?$')
    # Minimum requirement for a number; cast to string safeguards real floats
    if number_regex.match(str(value)) is not None:
        return factor[int(float(value) - 1)]
    return 0
def calc_score(series: pandas.Series) -> float:
    """Aggregate one answer column into a single weighted score.

    Columns whose name is not a question UUID are left untouched
    (the function implicitly returns None for them, as before).
    """
    if uuid_regex.match(series.name) is not None:
        # TODO: Negate that one question
        factor = question_cache.get(series.name)
        if factor is None:
            factor = intervals[api.get_scale_for(series.name)["id"]]
            question_cache[series.name] = factor
        # Catches weightless questions
        if not any(weight != 0 for weight in factor):
            return 0
        if factor == [1, 0]:
            # Yes/no scale: simply count the "Ja" answers
            return series.map(lambda x: 1 if x == "Ja" else 0).sum()
        # Pandas saves values as float, so they need to be converted
        converted = series.map(lambda x: convert_score(x, factor), na_action="ignore")
        return converted.sum()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
python
```
python
# Generate static values
generate_intervals()

participation = {}

participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
for file_name in os.listdir(participation_path):
    # Skip everything that is not a real participation CSV (e.g. the example template)
    if not file_name.endswith(".csv") or file_name.startswith("example"):
        continue
    semester = file_name.replace(".csv", "")
    # Key by semester with "/" instead of "-" (file names cannot contain "/")
    participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file_name))
```
```
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Calculate weight for Dimension
## Calculate weight for Dimension
Calculates the individual score for all dimensions for every course
Calculates the individual score for all dimensions for every course
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
python
```
python
result_frames = {}
for institute, semester, df in file_iterator():
    res_df = pandas.DataFrame(data={})
    group_columns = ["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Kategorie", "Veranstaltung-Name"]
    for key, members in df.groupby(group_columns):
        # Keeps the at least five responses constraint
        if len(members) < 5:
            continue
        head = pandas.Series({
            "Veranstaltung-Nr.": key[0],
            "Person-Name": key[1],
            "Veranstaltung-Typ": key[2],
            "Veranstaltung-Name": key[3],
            "Antworten": len(members),
        })
        # Score every remaining column, dropping ones that are entirely empty
        scores = members.dropna(axis="columns", how="all").apply(calc_score).dropna(how="all")
        combined = pandas.concat([head, scores])
        res_df = pandas.concat([res_df, combined.to_frame().T])
    result_frames["{0}_{1}".format(institute, semester)] = res_df
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
python
```
python
for institute_semester, df in result_frames.items():
    collection = []
    institute, semester = institute_semester.split("_")
    institute = CONSTANTS.INSTITUTE_MAP[institute]
    if "WiSe" in semester:
        # Align the semester key with the participation dict ("-" -> "/")
        semester = semester.replace("-", "/")
    if semester in participation:
        # This may lead to data loss but is impossible to work around
        semester_obj = participation[semester]
        semester_obj.drop_duplicates(inplace=True)
        df = df.merge(
            semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']],
            on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"],
            how="left",
        )

    for row in df.iterrows():
        record = row[1]
        course_object = {
            "course_number": str(record["Veranstaltung-Nr."]),
            "course_name": record["Veranstaltung-Name"],
            "course_type": record["Veranstaltung-Typ"],
            "lecturer": record["Person-Name"],
            "semester": semester,
            "institute": institute,
            "answers": record["Antworten"],
            # -1 indicated that no data is available
            "participants": -1,
            "scores": []
        }
        if "Anzahl Teilnehmende" in df.columns and not math.isnan(record["Anzahl Teilnehmende"]):
            course_object["participants"] = int(record["Anzahl Teilnehmende"])
        for col in record.index:
            if uuid_regex.match(col) is not None:
                # Early Break for irrelevant columns
                if math.isnan(record[col]):
                    continue
                course_object["scores"].append({"question": col, "score": int(record[col])})
        collection.append(course_object)
    api.force_courses(collection)
```
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment