Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
D
DAN
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Package Registry
Container Registry
Operate
Terraform modules
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Automatic Text Recognition
DAN
Commits
2afec4ec
Commit
2afec4ec
authored
1 year ago
by
Yoann Schneider
Browse files
Options
Downloads
Plain Diff
Merge branch 'convert-to-BIO' into 'main'
Convert NER prediction to BIO format Closes
#229
See merge request
!325
parents
65128721
fc146a8d
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!325
Convert NER prediction to BIO format
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
dan/bio.py
+85
-0
85 additions, 0 deletions
dan/bio.py
tests/test_bio.py
+120
-0
120 additions, 0 deletions
tests/test_bio.py
with
205 additions
and
0 deletions
dan/bio.py
0 → 100644
+
85
−
0
View file @
2afec4ec
import
logging
import
re
from
typing
import
Dict
,
List
from
dan.utils
import
EntityType
# Module-level logger (named after this module) used to report skipped entities.
logger = logging.getLogger(__name__)
def convert(text: str, ner_tokens: Dict[str, "EntityType"]) -> str:
    """Convert a NER prediction to BIO (IOB) format.

    Each word of the output is placed on its own line followed by its tag:
    ``O`` outside any entity, ``B-<name>`` on the first word of an entity and
    ``I-<name>`` on the following words. Nested entities are supported when
    ending tokens are used: closing an inner entity resumes the parent one.

    :param text: Predicted text holding the NER start/end tokens.
    :param ner_tokens: Mapping from entity name to its ``EntityType``, which
        exposes a ``start`` token and an ``end`` token (the end token is the
        empty string when only starting tokens are used).
    :return: The BIO-formatted string, one ``<word> <tag>`` pair per line.
    :raises AssertionError: When an ending token does not close the entity
        opened by the most recent starting token.
    """
    # Mapping to find a starting token for an ending token efficiently
    mapping_end_start: Dict[str, str] = {
        entity_type.end: entity_type.start for entity_type in ner_tokens.values()
    }
    # Mapping to find the entity name for a starting token efficiently
    mapping_start_name: Dict[str, str] = {
        entity_type.start: name for name, entity_type in ner_tokens.items()
    }

    # Materialize as lists so the annotations are accurate (dict.keys() returns
    # a view, not a List)
    starting_tokens: List[str] = list(mapping_start_name)
    ending_tokens: List[str] = list(mapping_end_start)
    # Whether ending tokens are used (an empty end token means "none")
    has_ending_tokens: bool = set(ending_tokens) != {""}

    # Add spaces around starting and ending tokens (if necessary) so that
    # str.split() isolates them as standalone tokens. re.escape guards against
    # tokens that happen to be regex metacharacters inside the character class.
    tokens_spacing: re.Pattern = re.compile(
        "([" + "".join(map(re.escape, starting_tokens + ending_tokens)) + "])"
    )
    text = tokens_spacing.sub(r" \1 ", text)

    iob: List[str] = []  # List of IOB formatted strings
    entity_types: List[str] = []  # Stack of currently opened entity start tokens
    inside: bool = False  # Whether we are inside an entity

    for token in text.split():
        # Encountering a starting token
        if token in starting_tokens:
            entity_types.append(token)
            # Stopping any current entity type
            inside = False
            continue
        # Encountering an ending token
        elif has_ending_tokens and token in ending_tokens:
            if not entity_types:
                # Lazy %-formatting: the message is only built if emitted
                logger.warning(
                    "Missing starting token for ending token %s, skipping the entity",
                    token,
                )
                continue
            # Making sure this ending token closes the current entity
            assert (
                entity_types[-1] == mapping_end_start[token]
            ), f"Ending token {token} doesn't match the starting token {entity_types[-1]}"
            # Removing the current entity from the stack as this is its end
            entity_types.pop()
            # If there are still entities in the stack, we continue in the
            # parent one. Else, we are not in any entity anymore.
            inside = bool(entity_types)
            continue

        # The token is not part of an entity
        if not entity_types:
            iob.append(f"{token} O")
            continue

        # The token is part of at least one entity
        entity_name: str = mapping_start_name[entity_types[-1]]
        if inside:
            # Inside the same entity
            iob.append(f"{token} I-{entity_name}")
            continue

        # Starting a new entity
        iob.append(f"{token} B-{entity_name}")
        inside = True

    # Concatenating all formatted iob strings
    return "\n".join(iob)
This diff is collapsed.
Click to expand it.
tests/test_bio.py
0 → 100644
+
120
−
0
View file @
2afec4ec
import
logging
import
pytest
from
dan.bio
import
convert
from
dan.utils
import
EntityType
# Sample prediction using starting tokens only
# (Ⓐ opens Person, Ⓑ opens Location, Ⓒ opens Date — see test_convert_starting_tokens)
ST_TEXT = """
ⒶBryan B ⒷParis ⒸJanuary 1st, 1987
ⒶJoe J ⒷGrenoble ⒸAugust 24, 1995
ⒶHannah H ⒷLille ⒸSeptember 15, 2002
"""
# Sample prediction using starting AND ending tokens
# (Ⓐ-Ⓑ Person, Ⓒ-Ⓓ Location, Ⓔ-Ⓕ Date); the second line nests a Person
# entity inside a Location entity
ST_ET_TEXT = """
ⒶBryanⒷ and ⒶJoeⒷ will visit the ⒸEiffel TowerⒹ in ⒸParisⒹ next ⒺTuesdayⒻ.
ⒶHannahⒷ will visit the ⒸPlace ⒶCharles de GaulleⒷ étoileⒹ on ⒺWednesdayⒻ.
"""
def test_convert_with_error():
    """A mismatched ending token must raise an AssertionError."""
    tokens = dict(
        Person=EntityType(start="Ⓐ", end="Ⓑ"),
        Location=EntityType(start="Ⓒ", end="Ⓓ"),
    )

    expected_message = "Ending token Ⓓ doesn't match the starting token Ⓐ"
    with pytest.raises(AssertionError, match=expected_message):
        convert("ⒶFredⒹ", tokens)
def test_convert_with_warnings(caplog):
    """Ending tokens without a matching starting token are skipped with a warning."""
    tokens = dict(
        Person=EntityType(start="Ⓐ", end="Ⓑ"),
        Location=EntityType(start="Ⓒ", end="Ⓓ"),
    )

    result = convert("BryanⒷ and ⒶJoeⒷ will visit the Eiffel TowerⒹ", tokens)
    assert result.split("\n") == [
        "Bryan O",
        "and O",
        "Joe B-Person",
        "will O",
        "visit O",
        "the O",
        "Eiffel O",
        "Tower O",
    ]

    # One warning per orphan ending token, in encounter order
    warning_template = "Missing starting token for ending token {}, skipping the entity"
    emitted = [(level, message) for _, level, message in caplog.record_tuples]
    assert emitted == [
        (logging.WARNING, warning_template.format("Ⓑ")),
        (logging.WARNING, warning_template.format("Ⓓ")),
    ]
def test_convert_starting_tokens():
    """BIO conversion when entity types only define starting tokens."""
    tokens = dict(
        Person=EntityType(start="Ⓐ"),
        Location=EntityType(start="Ⓑ"),
        Date=EntityType(start="Ⓒ"),
    )

    expected = [
        ("Bryan", "B-Person"),
        ("B", "I-Person"),
        ("Paris", "B-Location"),
        ("January", "B-Date"),
        ("1st,", "I-Date"),
        ("1987", "I-Date"),
        ("Joe", "B-Person"),
        ("J", "I-Person"),
        ("Grenoble", "B-Location"),
        ("August", "B-Date"),
        ("24,", "I-Date"),
        ("1995", "I-Date"),
        ("Hannah", "B-Person"),
        ("H", "I-Person"),
        ("Lille", "B-Location"),
        ("September", "B-Date"),
        ("15,", "I-Date"),
        ("2002", "I-Date"),
    ]
    assert convert(ST_TEXT, tokens).split("\n") == [
        f"{word} {tag}" for word, tag in expected
    ]
def test_convert_starting_and_ending_tokens():
    """BIO conversion with start/end token pairs, including nested entities."""
    tokens = dict(
        Person=EntityType(start="Ⓐ", end="Ⓑ"),
        Location=EntityType(start="Ⓒ", end="Ⓓ"),
        Date=EntityType(start="Ⓔ", end="Ⓕ"),
    )

    expected = [
        ("Bryan", "B-Person"),
        ("and", "O"),
        ("Joe", "B-Person"),
        ("will", "O"),
        ("visit", "O"),
        ("the", "O"),
        ("Eiffel", "B-Location"),
        ("Tower", "I-Location"),
        ("in", "O"),
        ("Paris", "B-Location"),
        ("next", "O"),
        ("Tuesday", "B-Date"),
        (".", "O"),
        ("Hannah", "B-Person"),
        ("will", "O"),
        ("visit", "O"),
        ("the", "O"),
        ("Place", "B-Location"),
        # Nested Person entity; closing it resumes the parent Location
        ("Charles", "B-Person"),
        ("de", "I-Person"),
        ("Gaulle", "I-Person"),
        ("étoile", "I-Location"),
        ("on", "O"),
        ("Wednesday", "B-Date"),
        (".", "O"),
    ]
    assert convert(ST_ET_TEXT, tokens).split("\n") == [
        f"{word} {tag}" for word, tag in expected
    ]
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment