diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 5b0731c4..48736ed6 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; namespace SIL.Machine.Corpora @@ -22,6 +23,7 @@ ParatextProjectSettings settings public string UpdateUsfm( string bookId, IReadOnlyList rows, + IReadOnlyList chapters = null, string fullName = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, @@ -29,7 +31,7 @@ public string UpdateUsfm( UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable preserveParagraphStyles = null, IEnumerable updateBlockHandlers = null, - IEnumerable remarks = null, + IEnumerable<(int, string)> remarks = null, Func errorHandler = null, bool compareSegments = false ) @@ -59,7 +61,11 @@ public string UpdateUsfm( ); try { - UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + var tokenizer = new UsfmTokenizer(_settings.Stylesheet); + IReadOnlyList tokens = tokenizer.Tokenize(usfm); + tokens = FilterTokensByChapter(tokens, chapters); + var parser = new UsfmParser(tokens, handler, _settings.Stylesheet, _settings.Versification); + parser.ProcessTokens(); return handler.GetUsfm(_settings.Stylesheet); } catch (Exception ex) @@ -73,6 +79,53 @@ public string UpdateUsfm( } } + /// + /// Filters tokens by the specified chapters. + /// + /// The tokens. + /// The chapters. If null, all tokens are returned. + /// The filtered tokens. + /// This is marked internal so test classes can use it. 
/// <summary>
/// Filters tokens by the specified chapters.
/// </summary>
/// <param name="tokens">The tokens, in document order.</param>
/// <param name="chapters">The chapter numbers to keep. If null, all tokens are returned.</param>
/// <returns>
/// <paramref name="tokens"/> itself when <paramref name="chapters"/> is null; otherwise a new list
/// holding, in their original order, the tokens of the leading <c>\id</c> section and the tokens of
/// every requested chapter.
/// </returns>
/// <remarks>
/// <para>
/// The <c>\id</c> section (the first token when its marker is <c>id</c>, plus the tokens after it up
/// to the next token that carries a different marker) is always kept. Material between the
/// <c>\id</c> section and the first chapter marker is kept only when chapter 1 is requested.
/// </para>
/// <para>
/// A chapter marker whose data does not parse as an integer is dropped together with everything up
/// to the next chapter marker.
/// </para>
/// <para>This is marked internal so test classes can use it.</para>
/// </remarks>
internal static IReadOnlyList<UsfmToken> FilterTokensByChapter(
    IReadOnlyList<UsfmToken> tokens,
    IReadOnlyList<int> chapters = null
)
{
    if (chapters is null)
        return tokens;

    // Looked up once per \id and once per chapter token; a set avoids rescanning the list each time.
    var requestedChapters = new HashSet<int>(chapters);
    var tokensWithinChapters = new List<UsfmToken>();
    bool inChapter = false;
    bool inIdMarker = false;

    for (int index = 0; index < tokens.Count; index++)
    {
        UsfmToken token = tokens[index];
        if (index == 0 && token.Marker == "id")
        {
            inIdMarker = true;
            // Front matter that follows \id (headers etc.) travels with chapter 1.
            inChapter = requestedChapters.Contains(1);
        }
        else if (inIdMarker && token.Marker != null && token.Marker != "id")
        {
            // Any other marker ends the \id section; marker-less tokens (its text) stay inside it.
            inIdMarker = false;
        }

        // Deliberately NOT an "else if" of the chain above: when a chapter marker directly follows
        // the \id line, that same token both ends the \id section and opens a chapter. Chaining the
        // two checks skipped the chapter test for that token, so the chapter was kept or dropped
        // based on stale state instead of its own number.
        if (token.Type == UsfmTokenType.Chapter)
        {
            // int.TryParse returns false for null/empty data, so malformed chapter numbers
            // (e.g. "1.") simply switch inclusion off until the next chapter marker.
            inChapter = int.TryParse(token.Data, out int chapter) && requestedChapters.Contains(chapter);
        }

        if (inIdMarker || inChapter)
            tokensWithinChapters.Add(token);
    }

    return tokensWithinChapters;
}

/// <summary>Asks the project file handler whether <paramref name="fileName"/> exists.</summary>
private bool Exists(string fileName) => _paratextProjectFileHandler.Exists(fileName);

/// <summary>Opens <paramref name="fileName"/> through the project file handler.</summary>
private Stream Open(string fileName) => _paratextProjectFileHandler.Open(fileName);
errorHandler = null, bool compareSegments = false ) @@ -107,7 +107,7 @@ public UpdateUsfmParserHandler( preserveParagraphStyles == null ? new HashSet { "r", "rem" } : new HashSet(preserveParagraphStyles); - _remarks = remarks?.ToList() ?? new List(); + _remarks = remarks?.ToList() ?? new List<(int, string)>(); _errorHandler = errorHandler; if (_errorHandler == null) _errorHandler = (error) => false; @@ -433,26 +433,66 @@ public string GetUsfm(string stylesheetFileName = "usfm.sty") public string GetUsfm(UsfmStylesheet stylesheet) { var tokenizer = new UsfmTokenizer(stylesheet); - List tokens = new List(_tokens); - if (_remarks.Count() > 0) + var tokens = new List(_tokens); + if (_remarks.Count > 0) { - var remarkTokens = new List(); - foreach (string remark in _remarks) + var remarkTokensByChapter = new Dictionary>(); + foreach ((int chapterNum, string remark) in _remarks) { - remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); - remarkTokens.Add(new UsfmToken(remark)); + // Add the remark tokens for each chapter that is to have remarks + if (!remarkTokensByChapter.TryGetValue(chapterNum, out List chapterTokens)) + { + chapterTokens = new List(); + remarkTokensByChapter.Add(chapterNum, chapterTokens); + } + + chapterTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); + chapterTokens.Add(new UsfmToken(remark)); } if (tokens.Count > 0) { - int index = 0; - HashSet markersToSkip = new HashSet() { "id", "ide", "rem" }; - while (markersToSkip.Contains(tokens[index].Marker)) + foreach (KeyValuePair> remarkTokens in remarkTokensByChapter) { - index++; - if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text) + int index; + HashSet markersToSkip; + if (remarkTokens.Key == 0) + { + // Add the remarks at the top level of the USFM, + // after the book id, encode, and any initial comments + index = 0; + markersToSkip = new HashSet { "id", "ide", "rem" }; + } + else + { + // Add the remarks just after the specified 
chapter + index = tokens.FindIndex(t => + t.Type == UsfmTokenType.Chapter + && int.TryParse(t.Data, out int chapterNumber) + && chapterNumber == remarkTokens.Key + ); + if (index == -1) + continue; index++; + markersToSkip = new HashSet { "rem" }; + } + + if (index >= tokens.Count) + { + // The remark insertion point is at the very end + tokens.AddRange(remarkTokens.Value); + } + else + { + while (markersToSkip.Contains(tokens[index].Marker)) + { + index++; + if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text) + index++; + } + + tokens.InsertRange(index, remarkTokens.Value); + } } - tokens.InsertRange(index, remarkTokens); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 9b5219c3..177e6e2c 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -1380,13 +1380,14 @@ public void GetUsfm_IdTags() } [Test] - public void GetUsfm_PreferExisting_AddRemark() + public void GetUsfm_PassRemark() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; + string usfm = @"\id MAT - Test \ide UTF-8 @@ -1395,40 +1396,114 @@ public void GetUsfm_PreferExisting_AddRemark() \v 1 Some text \v 2 \v 3 Other text +\c 2 +\rem Existing remark +\v 1 More text +\c 3 "; + string target = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.PreferExisting, - remarks: ["New remark"] + remarks: [(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2"), (3, "New remark 3")] ); + string result = @"\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark +\rem New remark 0 \c 1 +\rem New remark 1 \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem Existing remark +\rem New remark 2 +\v 1 More text +\c 3 +\rem New remark 3 "; AssertUsfmEquals(target, result); + } - target = UpdateUsfm( + [Test] 
+ public void GetUsfm_PassRemark0_NoExistingRemark() + { + var rows = new List + { + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), + }; + + string usfm = + @"\id MAT - Test +\ide UTF-8 +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +"; + + string target = UpdateUsfm( rows, - target, + usfm, textBehavior: UpdateUsfmTextBehavior.PreferExisting, - remarks: ["New remark 2"] + remarks: [(0, "New remark 0")] ); - result = + + string result = + @"\id MAT - Test +\ide UTF-8 +\rem New remark 0 +\c 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void GetUsfm_MultipleRemarksSameChapter() + { + var rows = new List + { + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), + }; + + string usfm = @"\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark -\rem New remark 2 \c 1 \v 1 Some text +\v 2 +\v 3 Other text +"; + + string target = UpdateUsfm( + rows, + usfm, + textBehavior: UpdateUsfmTextBehavior.PreferExisting, + remarks: [(0, "New remark 0.1"), (0, "New remark 0.2"), (1, "New remark 1.1"), (1, "New remark 1.2")] + ); + + string result = + @"\id MAT - Test +\ide UTF-8 +\rem Existing remark +\rem New remark 0.1 +\rem New remark 0.2 +\c 1 +\rem New remark 1.1 +\rem New remark 1.2 +\v 1 Some text \v 2 Update 2 \v 3 Other text "; @@ -1521,6 +1596,97 @@ public void UpdateBlock_FootnoteAtStartOfChapterWithPrecedingText() ); } + [Test] + public void FilterChapters() + { + string usfm = + @"\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +"; + + string target = UpdateUsfm(source: usfm, chapters: [2, 4]); + + string result = + @"\id MAT - Test +\c 2 +\v 1 Some text +\c 4 +\v 1 Some text +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void FilterChapters_WithChapterOneAndHeader() + { + string usfm = 
+ @"\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +"; + + string target = UpdateUsfm(source: usfm, chapters: [1, 3]); + + string result = + @"\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 3 +\v 1 Some text +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void FilterChapters_WithBadChapterReference() + { + string usfm = + @"\id MAT - Test +\c 1. +\v 1 Some text +\c 2. +\v 1 Some text +\c 3 +\v 1 Some text with good chapter reference +\c 4 +\v 1 Some text with good chapter reference +"; + + string target = UpdateUsfm(source: usfm, chapters: [2, 4]); + + string result = + @"\id MAT - Test +\c 4 +\v 1 Some text with good chapter reference +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); @@ -1529,6 +1695,7 @@ private static ScriptureRef[] ScrRef(params string[] refs) private static string UpdateUsfm( IReadOnlyList? rows = null, string? source = null, + IReadOnlyList? chapters = null, string? idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, @@ -1536,7 +1703,7 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable? preserveParagraphStyles = null, IEnumerable? usfmUpdateBlockHandlers = null, - IEnumerable? remarks = null, + IEnumerable<(int, string)>? 
remarks = null, bool compareSegments = false ) { @@ -1546,6 +1713,7 @@ private static string UpdateUsfm( return updater.UpdateUsfm( "MAT", rows, + chapters, idText, textBehavior, paragraphBehavior, @@ -1574,7 +1742,11 @@ private static string UpdateUsfm( (_) => false, compareSegments ); - UsfmParser.Parse(source, updater); + var tokenizer = new UsfmTokenizer(); + IReadOnlyList tokens = tokenizer.Tokenize(source); + tokens = ParatextProjectTextUpdaterBase.FilterTokensByChapter(tokens, chapters); + var parser = new UsfmParser(tokens, updater); + parser.ProcessTokens(); return updater.GetUsfm(); } }