From 88a19e82ee855d7ef2af5bbcd6c434f1c53ff92e Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Wed, 4 Jun 2025 11:32:31 -0400 Subject: [PATCH 1/2] Got rid of wildcard in documents path & fixed some typos --- dvc.lock | 11 ++++++----- dvc.yaml | 2 +- notebooks/word_count_prototype.ipynb | 2 +- src/cdstemplate/corpus_counter_script.py | 15 ++++++++------- src/cdstemplate/word_count.py | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/dvc.lock b/dvc.lock index aa51e59..22d096e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -2,7 +2,7 @@ schema: '2.0' stages: count-words: cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv - data/gutenberg/*.txt --case-insensitive + data/gutenberg --case-insensitive deps: - path: data/gutenberg md5: 41d960155f1a7f55480c03cea68ba2a7.dir @@ -10,9 +10,10 @@ stages: nfiles: 11 - path: src/cdstemplate/corpus_counter_script.py hash: md5 - md5: a4bb400c0cfd7050ac4b761b550a0a56 - size: 2582 + md5: 0db1baa08c580811414d608bd98bd7d9 + size: 2596 outs: - path: data/gutenberg_counts.csv - md5: 74abc508b4e4015ab4136405df251a57 - size: 4922 + hash: md5 + md5: 346733ebed78882d3b9871db6ef32043 + size: 4519 diff --git a/dvc.yaml b/dvc.yaml index ed00753..d02a37c 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,6 +1,6 @@ stages: count-words: - cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive + cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg --case-insensitive deps: - src/cdstemplate/corpus_counter_script.py - data/gutenberg diff --git a/notebooks/word_count_prototype.ipynb b/notebooks/word_count_prototype.ipynb index 02f797b..48f594a 100644 --- a/notebooks/word_count_prototype.ipynb +++ b/notebooks/word_count_prototype.ipynb @@ -358,7 +358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.13.3" }, "orig_nbformat": 4 }, diff 
--git a/src/cdstemplate/corpus_counter_script.py b/src/cdstemplate/corpus_counter_script.py index f0f3ae7..63f45e1 100644 --- a/src/cdstemplate/corpus_counter_script.py +++ b/src/cdstemplate/corpus_counter_script.py @@ -24,9 +24,9 @@ def main_cli(): parser.add_argument("csv", help="Path to the output CSV storing token counts. Required.") parser.add_argument( - "documents", - nargs="+", - help="Paths to at least one raw text document that make up the corpus. Required.", + "document_dir", + type=Path, + help="Path to folder containing raw .txt documents that make up the corpus. Required.", ) parser.add_argument( "--case-insensitive", @@ -38,19 +38,20 @@ def main_cli(): args = parser.parse_args() utils.configure_logging() logger.info("Command line arguments: %s", args) - main(args.csv, args.documents, args.case_insensitive) + main(args.csv, args.document_dir, args.case_insensitive) -def main(csv_out, documents, case_insensitive=False): +def main(csv_out, document_dir, case_insensitive=False): """Determine cumulative word counts for a list of documents and write the results to a CSV file :param csv_out: output CSV file path :type csv_out: str or Path - :param documents: list of paths to documents to parse word counts from - :type documents: list of str + :param document_dir: Path to folder containing .txt files + :type document_dir: Path :param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False :type case_insensitive: bool, optional """ + documents = Path(document_dir).glob("*.txt") cc = word_count.CorpusCounter(case_insensitive=case_insensitive) for i, doc in enumerate(documents): if i % 2 == 0: diff --git a/src/cdstemplate/word_count.py b/src/cdstemplate/word_count.py index ce2e561..34521a5 100644 --- a/src/cdstemplate/word_count.py +++ b/src/cdstemplate/word_count.py @@ -99,7 +99,7 @@ def get_token_counts_as_dataframe(self): return dataframe def save_token_counts(self, csv_file): - """Saves the counts of tokens the corpus to 
a specified + """Saves the counts of tokens from the corpus to a specified CSV file in alphabetical order :param csv_file: Path to desired CSV output file From 67127f72c7627835b2bba4f0d14325a6fa1b9e5e Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Wed, 4 Jun 2025 11:37:23 -0400 Subject: [PATCH 2/2] Changelog note about wildcard deps --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4396d07..d583675 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ You should also add project tags for each release in Github, see [Managing relea ### Removed - GitHub action to run flake8 for linting in build - +- Removed wildcard from corpus-counter script dependency ## [2.0.0] - 2024-05-29 ### Added