]> code.communitydata.science - rises_declines_wikia_code.git/blob - mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokenization.py
Initial commit
[rises_declines_wikia_code.git] / mediawiki_dump_tools / Mediawiki-Utilities / mw / lib / persistence / tokenization.py
1 import re
2
3
def wikitext_split(text):
    """
    Tokenize wikitext with a minimal regex suitable for latin
    character-based languages.

    Each token is either a run of word characters, a run of newlines or
    spaces, an HTML entity (``&word;``), a piece of wiki markup
    (``[[``, ``]]``, ``{{``, ``}}``, ``'''``, ``''``, ``=+``, ``{|``,
    ``|}``, ``|-``), or — failing all of those — a single character.

    :Parameters:
        text : str
            Text to split.
    """
    token_pattern = re.compile(
        r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|."
    )
    # findall() on a group-free pattern is equivalent to collecting the
    # full match of each non-overlapping occurrence in order.
    return [match.group(0) for match in token_pattern.finditer(text)]

Community Data Science Collective || Want to submit a patch?