Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ You should also add project tags for each release in Github, see [Managing relea

### Removed
- GitHub action to run flake8 for linting in build

- Removed wildcard from the corpus-counter script's documents argument; the script now takes the folder containing the .txt files

## [2.0.0] - 2024-05-29
### Added
Expand Down
11 changes: 6 additions & 5 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@ schema: '2.0'
stages:
count-words:
cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv
data/gutenberg/*.txt --case-insensitive
data/gutenberg --case-insensitive
deps:
- path: data/gutenberg
md5: 41d960155f1a7f55480c03cea68ba2a7.dir
size: 10940
nfiles: 11
- path: src/cdstemplate/corpus_counter_script.py
hash: md5
md5: a4bb400c0cfd7050ac4b761b550a0a56
size: 2582
md5: 0db1baa08c580811414d608bd98bd7d9
size: 2596
outs:
- path: data/gutenberg_counts.csv
md5: 74abc508b4e4015ab4136405df251a57
size: 4922
hash: md5
md5: 346733ebed78882d3b9871db6ef32043
size: 4519
2 changes: 1 addition & 1 deletion dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
stages:
count-words:
cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive
cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg --case-insensitive
deps:
- src/cdstemplate/corpus_counter_script.py
- data/gutenberg
Expand Down
2 changes: 1 addition & 1 deletion notebooks/word_count_prototype.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.13.3"
},
"orig_nbformat": 4
},
Expand Down
15 changes: 8 additions & 7 deletions src/cdstemplate/corpus_counter_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def main_cli():
parser.add_argument("csv", help="Path to the output CSV storing token counts. Required.")

parser.add_argument(
"documents",
nargs="+",
help="Paths to at least one raw text document that make up the corpus. Required.",
"document_dir",
type=Path,
help="Path to folder containing raw .txt documents that make up the corpus. Required.",
)
parser.add_argument(
"--case-insensitive",
Expand All @@ -38,19 +38,20 @@ def main_cli():
args = parser.parse_args()
utils.configure_logging()
logger.info("Command line arguments: %s", args)
main(args.csv, args.documents, args.case_insensitive)
main(args.csv, args.document_dir, args.case_insensitive)


def main(csv_out, documents, case_insensitive=False):
def main(csv_out, document_dir, case_insensitive=False):
"""Determine cumulative word counts for a list of documents and write the results to a CSV file

:param csv_out: output CSV file path
:type csv_out: str or Path
:param documents: list of paths to documents to parse word counts from
:type documents: list of str
:param document_dir: Path to folder containing .txt files
:type document_dir: Path
:param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False
:type case_insensitive: bool, optional
"""
documents = Path(document_dir).glob("*.txt")
cc = word_count.CorpusCounter(case_insensitive=case_insensitive)
for i, doc in enumerate(documents):
if i % 2 == 0:
Expand Down
2 changes: 1 addition & 1 deletion src/cdstemplate/word_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def get_token_counts_as_dataframe(self):
return dataframe

def save_token_counts(self, csv_file):
"""Saves the counts of tokens the corpus to a specified
"""Saves the counts of tokens from the corpus to a specified
CSV file in alphabetical order

:param csv_file: Path to desired CSV output file
Expand Down