cmazzoni / Genome_Evaluation_Pipeline / Commits

Commit 5737dd5e
authored 3 years ago by james94

stats indent error

parent d7f0b685

Showing 1 changed file: scripts/stats.py (+197 additions, −195 deletions)
@@ -25,165 +25,167 @@ else:

    size_bp = estSize


def fasta_iter(fasta_file):
    """
    Takes a FASTA file, and produces a generator of Header and Sequences.
    This is a memory-efficient way of analyzing a FASTA file -- without
    reading the entire file into memory.

    Parameters
    ----------
    fasta_file : str
        The file location of the FASTA file

    Returns
    -------
    header: str
        The string contained in the header portion of the sequence record
        (everything after the '>')
    seq: str
        The sequence portion of the sequence record
    """
    fh = open(fasta_file)
    fa_iter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
    for header in fa_iter:
        # drop the ">"
        header = next(header)[1:].strip()
        # join all sequence lines to one.
        seq = "".join(s.upper().strip() for s in next(fa_iter))
        yield header, seq
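The generator above streams one (header, sequence) pair at a time, so even a large assembly never has to fit in memory. A minimal usage sketch, illustrative only: it assumes the elided top of stats.py provides `from itertools import groupby`, which fasta_iter relies on, and "example.fasta" is a placeholder path, not a file from this repository.

# Illustrative consumption of fasta_iter; not part of this commit.
for header, seq in fasta_iter("example.fasta"):
    print(header, len(seq))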
def read_genome(fasta_file):
    """
    Takes a FASTA file, and produces 2 lists of sequence lengths. It also
    calculates the GC Content, since this is the only statistic that is not
    calculated based on sequence lengths.

    Parameters
    ----------
    fasta_file : str
        The file location of the FASTA file

    Returns
    -------
    contig_lens: list
        A list of lengths of all contigs in the genome.
    scaffold_lens: list
        A list of lengths of all scaffolds in the genome.
    gc_cont: float
        The percentage of total basepairs in the genome that are either G or C.
    """
    gc = 0
    total_len = 0
    contig_lens = []
    scaffold_lens = []
    for _, seq in fasta_iter(fasta_file):
        scaffold_lens.append(len(seq))
        if "NN" in seq:
            contig_list = seq.split("NN")
        else:
            contig_list = [seq]
        for contig in contig_list:
            if len(contig):
                gc += contig.count('G') + contig.count('C')
                total_len += len(contig)
                contig_lens.append(len(contig))
    gc_cont = (gc / total_len) * 100
    return contig_lens, scaffold_lens, gc_cont
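A quick toy check of the splitting logic above (values are illustrative, not from the pipeline): a scaffold containing a double-N gap is counted once in scaffold_lens, but it contributes one contig per gap-free stretch, and only those stretches feed the GC tally.

# Toy illustration of read_genome's contig/GC logic (not in the repository);
# the split is mimicked inline because read_genome itself reads a FASTA file from disk.
seq = "ATGCNNGGCC"                  # one scaffold of length 10 with a 2-bp gap
contigs = seq.split("NN")           # ['ATGC', 'GGCC']
gc = sum(c.count('G') + c.count('C') for c in contigs)   # 6
total = sum(len(c) for c in contigs)                      # 8
print(gc / total * 100)             # 75.0 -> gc_cont for this toy scaffold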
def calculate_stats(seq_lens, gc_cont):
    naming = sys.argv[3]
    stats = {}
    seq_array = np.array(seq_lens)
    # stats['Assembly:']=naming
    stats['sequence_count'] = seq_array.size
    testsize = stats['sequence_count']
    stats['number_of_gaps'] = 0
    # print("this is the count",naming," ", testsize)
    stats['gc_content (%)'] = gc_cont
    sorted_lens = seq_array[np.argsort(-seq_array)]
    stats['longest (bp)'] = int(sorted_lens[0])
    testlongest = stats['longest (bp)']
    # print("this is the longest", naming," ",testlongest)
    stats['shortest (bp)'] = int(sorted_lens[-1])
    # stats['median'] = np.median(sorted_lens)
    # stats['mean'] = np.mean(sorted_lens)
    stats['total_bps (bp)'] = int(np.sum(sorted_lens))
    testprint = stats['total_bps (bp)']
    # print("total_bp is", naming," ",testprint)
    stats['estimated_size (bp)'] = int(size_bp)
    csum = np.cumsum(sorted_lens)
    # if stats['total_bps (bp)'] < stats['estimated_size (bp)']:
    #     csum_ng = np.append(csum, stats['estimated_size (bp)'])
    # else:
    #     csum_ng=csum
    for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]:
        nx = int(stats['total_bps (bp)'] * (level / 100))
        csumn = min(csum[csum >= nx])
        l_level = int(np.where(csum == csumn)[0]) + 1
        stats['L' + str(level)] = l_level
    for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]:
        # print("the totalbps are:", stats['total_bps (bp)'])
        nx = int(stats['total_bps (bp)'] * (level / 100))
        # print("this is the nx", nx)
        # print("this is the csum", csum)
        csumn = min(csum[csum >= nx])
        # print("this is the csumn", csumn)
        l_level = int(np.where(csum == csumn)[0])
        n_level = int(sorted_lens[l_level])
        stats['N' + str(level)] = n_level
        # print(level, " ", n_level)
    for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]:
        # print("the estbps are:", stats['estimated_size (bp)'])
        ngx = int(stats['estimated_size (bp)'] * (level / 100))
        # print("this is the ngx", ngx)
        # print("this is the csum", csum_ng)
        # print("this is the csum", csum)
        # print("this is the [csum >= ngx]", np.array(csum >= ngx))
        new_array = np.array(csum >= ngx)
        # print(np.any(new_array))
        if np.any(new_array) == False:
            csumng = csum[seq_array.size - 1]
            # print("this is the csumng", csumng)
            lg_level = int(np.where(csum == csumng)[0]) + 1
            stats['LG' + str(level)] = lg_level
        elif np.any(new_array) == True:
            csumng = min(csum[csum >= ngx])
            # print("this is the csumng", csumng)
            lg_level = int(np.where(csum == csumng)[0]) + 1
            stats['LG' + str(level)] = lg_level
    for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]:
        ngx = int(stats['estimated_size (bp)'] * (level / 100))
        new_array = np.array(csum >= ngx)
        # print(np.any(new_array))
        if np.any(new_array) == False:
            csumng = csum[seq_array.size - 1]
            # print("this is the csumng", csumng)
            lg_level = int(np.where(csum == csumng)[0])
            ng_level = int(sorted_lens[lg_level])
            stats['NG' + str(level)] = ng_level
        elif np.any(new_array) == True:
            csumng = min(csum[csum >= ngx])
            # print("this is the csumng", csumng)
            lg_level = int(np.where(csum == csumng)[0])
            ng_level = int(sorted_lens[lg_level])
            stats['NG' + str(level)] = ng_level
    return stats
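Each of the loops above is the same cumulative-sum threshold trick applied to a different target (total assembly size for Nxx/Lxx, estimated genome size for NGxx/LGxx). A self-contained sketch of the N50/L50 case, with toy lengths that are not from the pipeline and an explicit [0][0] index where the file relies on a single match:

# Standalone N50/L50 sketch mirroring calculate_stats' approach (illustrative only).
import numpy as np

lens = np.array([40, 30, 20, 10])    # toy sequence lengths, already sorted descending
csum = np.cumsum(lens)               # [40, 70, 90, 100]
nx = int(lens.sum() * (50 / 100))    # 50% of the total assembly size = 50
csumn = min(csum[csum >= nx])        # first cumulative sum reaching that threshold: 70
idx = int(np.where(csum == csumn)[0][0])
print("N50 =", int(lens[idx]))       # 30: length of the sequence that carries the assembly past 50%
print("L50 =", idx + 1)              # 2: number of sequences needed to reach that point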
if __name__ == "__main__":
    # print(size_bp)
    # print(type(size_bp))
    naming = sys.argv[3]
    infilename = sys.argv[1]
    contig_lens, scaffold_lens, gc_cont = read_genome(infilename)
    # contig_stats = calculate_stats(contig_lens, gc_cont)
    scaffold_stats = calculate_stats(scaffold_lens, gc_cont)
    contig_stats = calculate_stats(contig_lens, gc_cont)
    gaps = contig_stats.get('sequence_count') - scaffold_stats.get('sequence_count')
    scaffold_stats['number_of_gaps'] = gaps
    contig_stats['number_of_gaps'] = gaps
    # print(scaffold_stats)
    # df_scaffold_all= pd.DataFrame.from_dict(scaffold_stats, orient= 'index')
    # print(df_scaffold_all)
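For orientation, the script is driven purely by positional arguments. The mapping below is read off the sys.argv indices visible in this diff (sys.argv[2] is consumed in a part of the file not shown here), so treat it as a sketch rather than a documented interface.

# Hypothetical summary of the positional interface (not part of stats.py).
import sys

if len(sys.argv) >= 7:
    infilename = sys.argv[1]        # input FASTA assembly
    naming = sys.argv[3]            # assembly label, used as a report column header
    report_paths = sys.argv[4:7]    # destination paths for the generated reports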
@@ -193,72 +195,72 @@ if __name__ == "__main__":
    # s.index.name = 'Assembly:'
    # s.reset_index()
    # print(s)
    scaff_seq = pd.DataFrame(scaffold_stats.items(), columns=['Assembly:', naming])
    df_scaffold_top = scaff_seq.iloc[0:7, 0:2]
    df_scaffold_top[naming] = df_scaffold_top[naming].astype('int64').apply('{:,}'.format)
    # df_scaffold_top=df_scaffold_top.style.hide_index()
    # df_scaffold_top[naming].round(decimals=0)
    # df_contig_all=pd.DataFrame(data=contig_stats)
    # df_contig_top=df_contig_all.iloc[0:6,0:2]
    df_scaffold_Nxx = pd.DataFrame(scaffold_stats.items(), columns=['Nxx Level (%)', 'Length of Nxx Scaffold (bp)'])
    df_scaffold_Nxx = df_scaffold_Nxx.iloc[31:55, 0:2]
    df_scaffold_Nxx = df_scaffold_Nxx.reset_index()
    # print(df_scaffold_Nxx)
    df_scaffold_NGxx = pd.DataFrame(scaffold_stats.items(), columns=['NGxx Level (%)', 'Length of NGxx Scaffold (bp)'])
    df_scaffold_NGxx = df_scaffold_NGxx.iloc[79:104, 0:2]
    df_scaffold_NGxx = df_scaffold_NGxx.reset_index()
    # print(df_scaffold_NGxx)
    df_scaffold_N_NG = pd.concat([df_scaffold_Nxx, df_scaffold_NGxx], axis=1)
    df_scaffold_N_NG = df_scaffold_N_NG.drop(df_scaffold_N_NG.columns[[0, 3]], axis=1)
    df_scaffold_N_NG['Length of Nxx Scaffold (bp)'] = df_scaffold_N_NG['Length of Nxx Scaffold (bp)'].astype('int64').apply('{:,}'.format)
    df_scaffold_N_NG['Length of NGxx Scaffold (bp)'] = df_scaffold_N_NG['Length of NGxx Scaffold (bp)'].astype('int64').apply('{:,}'.format)
    # df_scaffold_N_NG=df_scaffold_N_NG.style.hide_index()
    df_scaffold_Lxx = pd.DataFrame(scaffold_stats.items(), columns=['Lxx Level (%)', 'Count of scaffolds (for Lxx Level)'])
    df_scaffold_Lxx = df_scaffold_Lxx.iloc[7:31, 0:2]
    df_scaffold_Lxx = df_scaffold_Lxx.reset_index()
    # print(df_scaffold_Nxx)
    df_scaffold_LGxx = pd.DataFrame(scaffold_stats.items(), columns=['LGxx Level (%)', 'Count of scaffolds (for LGxx Level)'])
    df_scaffold_LGxx = df_scaffold_LGxx.iloc[55:79, 0:2]
    df_scaffold_LGxx = df_scaffold_LGxx.reset_index()
    # print(df_scaffold_NGxx)
    df_scaffold_L_LG = pd.concat([df_scaffold_Lxx, df_scaffold_LGxx], axis=1)
    df_scaffold_L_LG = df_scaffold_L_LG.drop(df_scaffold_L_LG.columns[[0, 3]], axis=1)
    df_scaffold_L_LG['Count of scaffolds (for Lxx Level)'] = df_scaffold_L_LG['Count of scaffolds (for Lxx Level)'].astype('int64').apply('{:,}'.format)
    df_scaffold_L_LG['Count of scaffolds (for LGxx Level)'] = df_scaffold_L_LG['Count of scaffolds (for LGxx Level)'].astype('int64').apply('{:,}'.format)
    ################################################################################################################
    contig_seq = pd.DataFrame(contig_stats.items(), columns=['Assembly:', naming])
    df_contig_top = contig_seq.iloc[0:7, 0:2]
    df_contig_top[naming] = df_contig_top[naming].astype('int64').apply('{:,}'.format)
    # df_scaffold_top=df_scaffold_top.style.hide_index()
    # df_scaffold_top[naming].round(decimals=0)
    # df_contig_all=pd.DataFrame(data=contig_stats)
    # df_contig_top=df_contig_all.iloc[0:6,0:2]
    df_contig_Nxx = pd.DataFrame(contig_stats.items(), columns=['Nxx Level (%)', 'Length of Nxx contig (bp)'])
    df_contig_Nxx = df_contig_Nxx.iloc[31:55, 0:2]
    df_contig_Nxx = df_contig_Nxx.reset_index()
    # print(df_scaffold_Nxx)
    df_contig_NGxx = pd.DataFrame(contig_stats.items(), columns=['NGxx Level (%)', 'Length of NGxx contig (bp)'])
    df_contig_NGxx = df_contig_NGxx.iloc[79:104, 0:2]
    df_contig_NGxx = df_contig_NGxx.reset_index()
    # print(df_scaffold_NGxx)
    df_contig_N_NG = pd.concat([df_contig_Nxx, df_contig_NGxx], axis=1)
    df_contig_N_NG = df_contig_N_NG.drop(df_contig_N_NG.columns[[0, 3]], axis=1)
    df_contig_N_NG['Length of Nxx contig (bp)'] = df_contig_N_NG['Length of Nxx contig (bp)'].astype('int64').apply('{:,}'.format)
    df_contig_N_NG['Length of NGxx contig (bp)'] = df_contig_N_NG['Length of NGxx contig (bp)'].astype('int64').apply('{:,}'.format)
    # df_scaffold_N_NG=df_scaffold_N_NG.style.hide_index()
    df_contig_Lxx = pd.DataFrame(contig_stats.items(), columns=['Lxx Level (%)', 'Count of contig (for Lxx Level)'])
    df_contig_Lxx = df_contig_Lxx.iloc[7:31, 0:2]
    df_contig_Lxx = df_contig_Lxx.reset_index()
    # print(df_scaffold_Nxx)
    df_contig_LGxx = pd.DataFrame(contig_stats.items(), columns=['LGxx Level (%)', 'Count of contig (for LGxx Level)'])
    df_contig_LGxx = df_contig_LGxx.iloc[55:79, 0:2]
    df_contig_LGxx = df_contig_LGxx.reset_index()
    # print(df_scaffold_NGxx)
    df_contig_L_LG = pd.concat([df_contig_Lxx, df_contig_LGxx], axis=1)
    df_contig_L_LG = df_contig_L_LG.drop(df_contig_L_LG.columns[[0, 3]], axis=1)
    df_contig_L_LG['Count of contig (for Lxx Level)'] = df_contig_L_LG['Count of contig (for Lxx Level)'].astype('int64').apply('{:,}'.format)
    df_contig_L_LG['Count of contig (for LGxx Level)'] = df_contig_L_LG['Count of contig (for LGxx Level)'].astype('int64').apply('{:,}'.format)
    # df_scaffold_L_LG=df_scaffold_L_LG.style.hide_index()
    # print(df_contig_top)
    # print(scaffold_stats)
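The iloc windows above depend on the insertion order of the stats dict built in calculate_stats. The sketch below reproduces that layout so the row ranges can be sanity-checked; it is an editorial aid derived from the code shown in this diff, not part of stats.py.

# Sanity sketch of the key layout assumed by the iloc slices (illustrative only).
levels = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]
summary = ['sequence_count', 'number_of_gaps', 'gc_content (%)', 'longest (bp)',
           'shortest (bp)', 'total_bps (bp)', 'estimated_size (bp)']
keys = (summary                                    # rows 0:7   -> df_*_top
        + ['L' + str(l) for l in levels]           # rows 7:31  -> df_*_Lxx
        + ['N' + str(l) for l in levels]           # rows 31:55 -> df_*_Nxx
        + ['LG' + str(l) for l in levels]          # rows 55:79 -> df_*_LGxx
        + ['NG' + str(l) for l in levels])         # rows 79:103 -> df_*_NGxx
print(len(keys))                                   # 103 keys, so iloc[79:104] simply runs to the end
print(keys[7], keys[31], keys[55], keys[79])       # L5 N5 LG5 NG5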
@@ -286,27 +288,27 @@ if __name__ == "__main__":
    # contig = csv.writer(open(naming + "_contig_stats.tsv", "w"), delimiter='\t')
    # for key, val in contig_stats.items():
    #
    with open(sys.argv[5], 'w') as outputfile:
        # # print('#' + libraryName, file=outputfile)
        # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
        # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
        # # print("Successfully Processed reads (Paired): " + successfull + " ( "+str(successfull_perc)+"%)", file=outputfile)
        print(df_scaffold_top.to_string(index=False), file=outputfile)
        print("", file=outputfile)
        print(df_scaffold_N_NG.to_string(index=False), file=outputfile)
        print("", file=outputfile)
        print(df_scaffold_L_LG.to_string(index=False), file=outputfile)
    with open(sys.argv[6], 'w') as outputfile2:
        # # print('#' + libraryName, file=outputfile)
        # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
        # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
        # # print("Successfully Processed reads (Paired): " + successfull + " ( "+str(successfull_perc)+"%)", file=outputfile)
        print(df_contig_top.to_string(index=False), file=outputfile2)
        print("", file=outputfile2)
        print(df_contig_N_NG.to_string(index=False), file=outputfile2)
        print("", file=outputfile2)
        print(df_contig_L_LG.to_string(index=False), file=outputfile2)
    # with open(sys.argv[4], 'w') as outputRst:
    #     print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
@@ -324,21 +326,21 @@ if __name__ == "__main__":
    # print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
    # print("", file=outputRst2)
    with open(sys.argv[4], 'w') as outputRst:
        print(tabulate(df_scaffold_top, headers='keys', tablefmt="pipe", showindex=False), file=outputRst)
        print("", file=outputRst)
        print(tabulate(df_scaffold_N_NG, headers='keys', tablefmt="pipe", showindex=False), file=outputRst)
        print("", file=outputRst)
        print(tabulate(df_scaffold_L_LG, headers='keys', tablefmt="pipe", showindex=False), file=outputRst)
        print("", file=outputRst)
    #
    with open(sys.argv[5], 'w') as outputRst2:
        print(tabulate(df_contig_top, headers='keys', tablefmt="pipe", showindex=False), file=outputRst2)
        print("", file=outputRst2)
        print(tabulate(df_contig_N_NG, headers='keys', tablefmt="pipe", showindex=False), file=outputRst2)
        print("", file=outputRst2)
        print(tabulate(df_contig_L_LG, headers='keys', tablefmt="pipe", showindex=False), file=outputRst2)
        print("", file=outputRst2)
    # list_of_dfs=[df_scaffold_top,df_scaffold_N_NG,df_scaffold_L_LG]
    # for df in list_of_dfs:
    #     with open('all_dfs.tsv','a') as f:
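For reference, tabulate's "pipe" format emits Markdown-style tables, which is what distinguishes these reports from the plain to_string() dumps in the previous hunk. A minimal standalone sketch with toy values (independent of the pipeline's real output; column widths will vary):

# Illustrative tabulate "pipe" output; not part of this commit.
import pandas as pd
from tabulate import tabulate

toy = pd.DataFrame({'Assembly:': ['sequence_count', 'longest (bp)'],
                    'my_assembly': ['1,234', '5,678,901']})
print(tabulate(toy, headers='keys', tablefmt="pipe", showindex=False))
# Output resembles:
# | Assembly:      | my_assembly   |
# |:---------------|:--------------|
# | sequence_count | 1,234         |
# | longest (bp)   | 5,678,901     |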