Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
evaluation-report-generator-for-data-viz
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
mozi
evaluation-report-generator-for-data-viz
Commits
e98b464a
Commit
e98b464a
authored
1 year ago
by
alexander06
Browse files
Options
Downloads
Patches
Plain Diff
Remove redundant clean in main pipeline
parent
8ac2d2f9
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/pipeline.ipynb
+4
-10
4 additions, 10 deletions
src/pipeline.ipynb
with
4 additions
and
10 deletions
src/pipeline.ipynb
+
4
−
10
View file @
e98b464a
...
...
@@ -105,11 +105,11 @@
"\n",
"participation = {}\n",
"\n",
"participation_path = os.path.join(CONSTANTS.
RAW
_DATA_PATH, \"participation\")\n",
"participation_path = os.path.join(CONSTANTS.
CLEAN
_DATA_PATH, \"participation\")\n",
"for file in os.listdir(participation_path):\n",
"\tif not file.endswith(\".csv\") or file.startswith(\"example\"): continue\n",
"\tsemester = file.replace(\".csv\", \"\")\n",
"\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file)
, sep=\";\"
)"
"\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file))"
]
},
{
...
...
@@ -143,15 +143,10 @@
},
{
"cell_type": "code",
"execution_count":
null
,
"execution_count":
5
,
"metadata": {},
"outputs": [],
"source": [
"#print(result_frames)\n",
"def norm_name(name: str) -> str:\n",
"\tif ', ' not in name: return name\n",
"\tsplit = name.split(\", \")\n",
"\treturn split[1] + \" \" + split[0]\n",
"for (institute_semester, df) in result_frames.items():\n",
"\tcollection = []\n",
"\tinstitute, semester = institute_semester.split(\"_\") \n",
...
...
@@ -163,8 +158,7 @@
"\t\t# This may lead to data loss but is impossible to work around\n",
"\t\tsemester_obj = participation[semester]\n",
"\t\tsemester_obj.drop_duplicates(inplace=True)\n",
"\t\tnames = semester_obj[\"Person-Name\"]\n",
"\t\tsemester_obj[\"Person-Name\"] = names.map(norm_name)\n",
"\n",
"\t\tdf = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=[\"Veranstaltung-Nr.\", \"Person-Name\", \"Veranstaltung-Name\"], how=\"left\")\n",
"\n",
"\tfor row in df.iterrows():\n",
...
...
%% Cell type:markdown id: tags:
# Evaluation Report Pipeline
This file hosts the pipeline, which generates all images containing the final evaluation results
%% Cell type:code id: tags:
```
python
import
os
,
pandas
,
math
,
json
import
regex
as
re
import
lib.constants
as
CONSTANTS
import
lib.api
as
api
question_cache
=
{}
intervals
=
{}
uuid_regex
=
re
.
compile
(
'
\\
b[0-9a-f]{8}
\\
b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-
\\
b[0-9a-f]{12}
\\
b
'
)
```
%% Cell type:code id: tags:
```
python
def file_iterator():
    """Yield the survey data of every directory below the clean-data root.

    Walks ``CONSTANTS.CLEAN_DATA_PATH`` and, for each directory that
    contains a file named ``all.csv``, loads that file with
    ``pandas.read_csv``.

    Yields:
        A three-element list: the last two ``/``-separated components of
        the directory path, followed by the loaded DataFrame.
    """
    for directory, _subdirs, filenames in os.walk(CONSTANTS.CLEAN_DATA_PATH):
        if "all.csv" not in filenames:
            continue
        frame = pandas.read_csv(os.path.join(directory, "all.csv"))
        # The two innermost path components identify the data set.
        trailing_parts = directory.split("/")[-2:]
        yield trailing_parts + [frame]
def generate_scales():
    """Fetch the answers of every scale reported by the API.

    Returns:
        A dict mapping each scale's ``"id"`` to whatever
        ``api.get_answers_for`` returns for that id.
    """
    return {
        scale["id"]: api.get_answers_for(scale["id"])
        for scale in api.get_scales()
    }
def generate_intervals():
    """Rebuild the module-level ``intervals`` lookup.

    For each scale returned by ``generate_scales`` the ``'value'`` entry of
    every answer is collected, in order, into a list keyed by the scale id.
    The result replaces the global ``intervals``; nothing is returned.
    """
    global intervals
    values_by_scale = {}
    for scale_id, answers in generate_scales().items():
        values_by_scale[scale_id] = [answer['value'] for answer in answers]
    intervals = values_by_scale
def get_factors():
    """Derive one factor per scale from the global ``intervals``.

    Every scale except ``"scale_time"`` and ``"y_n"`` must hold exactly two
    values ``(a, b)``; its factor is ``abs(a - b - 1) / 2`` rounded to zero
    decimals.

    Returns:
        A dict mapping scale id to its factor.

    Raises:
        ValueError: if a non-skipped scale does not hold exactly two values.
    """
    # These intervals either just have two steps or the answers have no weight
    skipped = ("scale_time", "y_n")
    factors = {}
    for scale_id, bounds in intervals.items():
        if scale_id in skipped:
            continue
        first, second = bounds
        factors[scale_id] = round(abs(first - second - 1) / 2, 0)
    return factors
# Matches a single digit 0-9, optionally followed by ".0" (the form a whole
# number takes once it has been stored as a float). Compiled once instead of
# on every call, and written as a raw string so "\d" is not an invalid escape.
_NUMBER_REGEX = re.compile(r'^\d(\.0)?$')


# The type of the value is unknown
def convert_score(value, factor) -> int:
    """Translate one raw answer into its weight.

    Args:
        value: The raw cell content; its type is unknown, so it is compared
            through its string form.
        factor: Sequence of weights, looked up at position ``value - 1``.

    Returns:
        ``factor[value - 1]`` when ``value`` is a single digit (optionally
        written as ``d.0``), otherwise ``0``. A value of ``0`` also yields
        ``0``: it would otherwise become index ``-1`` and silently pick the
        last weight.

    Raises:
        IndexError: if the digit is larger than ``len(factor)``.
    """
    # Minimum requirement for a number
    # Cast to string safeguards real floats
    if _NUMBER_REGEX.match(str(value)) is None:
        return 0
    index = int(float(value)) - 1
    if index < 0:
        # Guard against negative-index wrap-around for an answer of 0.
        return 0
    return factor[index]
def calc_score(series: pandas.Series) -> float:
    """Sum the weighted answers of one question column.

    Args:
        series: A column whose ``name`` is matched against ``uuid_regex``
            at its start.

    Returns:
        ``None`` when the column name does not match ``uuid_regex``.
        ``0`` when every weight of the question's scale is zero.
        The count of ``"Ja"`` answers when the weights are ``[1, 0]``.
        Otherwise the sum of ``convert_score`` over all non-missing answers.

    The weights are read from ``intervals`` via ``api.get_scale_for`` and
    memoised per question in ``question_cache``.
    """
    question_id = series.name
    if uuid_regex.match(question_id) is None:
        return None

    # TODO: Negate that one question
    if question_id not in question_cache:
        scale = api.get_scale_for(question_id)
        question_cache[question_id] = intervals[scale["id"]]
    weights = question_cache[question_id]

    # Catches weightless questions
    if not any(weight != 0 for weight in weights):
        return 0

    if weights == [1, 0]:
        yes_flags = series.map(lambda answer: 1 if answer == "Ja" else 0)
        return yes_flags.sum()

    # Pandas saves values as float, so they need to be converted
    converted = series.map(
        lambda answer: convert_score(answer, weights),
        na_action="ignore",
    )
    return converted.sum()
```
%% Cell type:code id: tags:
```
python
# Generate static values
generate_intervals()

# Participation tables keyed by the file name without ".csv" and with every
# "-" replaced by "/".
participation = {}

# The tables are read from the clean-data directory only. The earlier
# raw-data path and the extra read with sep=";" were immediately overwritten,
# so they had no effect on the result and are dropped.
participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
for file in os.listdir(participation_path):
    # Only CSV files are loaded; files starting with "example" are skipped.
    if not file.endswith(".csv") or file.startswith("example"):
        continue
    semester = file.replace(".csv", "")
    participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file))
```
%% Cell type:markdown id: tags:
## Calculate weight for Dimension
Calculates the individual score for all dimensions for every course
%% Cell type:code id: tags:
```
python
# One frame per data set, keyed "<institute>_<semester>", one row per course.
result_frames = {}

for institute, semester, df in file_iterator():
    res_df = pandas.DataFrame(data={})
    grouped = df.groupby([
        "Veranstaltung-Nr.",
        "Person-Name",
        "Veranstaltung-Kategorie",
        "Veranstaltung-Name",
    ])
    for group in grouped:
        group_key, responses = group[0], group[1]
        response_count = len(responses)
        # Keeps the at least five responses constraint
        if response_count < 5:
            continue

        # Identifying columns of the course plus the number of responses.
        head = pandas.Series({
            "Veranstaltung-Nr.": group_key[0],
            "Person-Name": group_key[1],
            "Veranstaltung-Typ": group_key[2],
            "Veranstaltung-Name": group_key[3],
            "Antworten": response_count,
        })

        # Drop columns nobody answered, score the rest, and drop the columns
        # for which calc_score produced no value.
        answered = responses.dropna(axis="columns", how="all")
        result = answered.apply(calc_score).dropna(how="all")

        combined = pandas.concat([head, result])
        res_df = pandas.concat([res_df, combined.to_frame().T])

    frame_key = "{0}_{1}".format(institute, semester)
    result_frames[frame_key] = res_df
```
%% Cell type:code id: tags:
```
python
#print(result_frames)
def norm_name(name: str) -> str:
    """Turn ``"Last, First"`` into ``"First Last"``.

    Names without the separator ``", "`` are returned unchanged. When the
    separator occurs more than once, only the first two parts are used.
    """
    separator = ", "
    if separator in name:
        last, first = name.split(separator)[:2]
        return first + " " + last
    return name
for (institute_semester, df) in result_frames.items():
    # Courses of this data set; handed to the API once the frame is processed.
    collection = []
    institute, semester = institute_semester.split("_")
    institute = CONSTANTS.INSTITUTE_MAP[institute]
    # Semesters containing "WiSe" use "/" instead of "-", matching the keys
    # of the participation dict.
    if "WiSe" in semester:
        semester = semester.replace("-", "/")

    if semester in participation:
        # This may lead to data loss but is impossible to work around
        semester_obj = participation[semester]
        semester_obj.drop_duplicates(inplace=True)
        names = semester_obj["Person-Name"]
        semester_obj["Person-Name"] = names.map(norm_name)

        # Attach the participant count to every course row that matches.
        merge_keys = ["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"]
        participant_columns = semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']]
        df = df.merge(participant_columns, on=merge_keys, how="left")

    has_participants = "Anzahl Teilnehmende" in df.columns
    for row in df.iterrows():
        record = row[1]
        course_object = {
            "course_number": str(record["Veranstaltung-Nr."]),
            "course_name": record["Veranstaltung-Name"],
            "course_type": record["Veranstaltung-Typ"],
            "lecturer": record["Person-Name"],
            "semester": semester,
            "institute": institute,
            "answers": record["Antworten"],
            # -1 indicates that no data is available
            "participants": -1,
            "scores": [],
        }
        if has_participants and not math.isnan(record["Anzahl Teilnehmende"]):
            course_object["participants"] = int(record["Anzahl Teilnehmende"])

        for col in record.index:
            # Only columns named after a question UUID carry scores.
            if uuid_regex.match(col) is None:
                continue
            # Early Break for irrelevant columns
            if math.isnan(record[col]):
                continue
            course_object["scores"].append({"question": col, "score": int(record[col])})
        collection.append(course_object)
    api.force_courses(collection)
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment