Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace SIL.Machine.Corpora
Expand All @@ -22,14 +23,15 @@ ParatextProjectSettings settings
public string UpdateUsfm(
string bookId,
IReadOnlyList<UpdateUsfmRow> rows,
IReadOnlyList<int> chapters = null,
string fullName = null,
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null,
IEnumerable<(int, string)> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
Expand Down Expand Up @@ -59,7 +61,11 @@ public string UpdateUsfm(
);
try
{
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
var tokenizer = new UsfmTokenizer(_settings.Stylesheet);
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
tokens = FilterTokensByChapter(tokens, chapters);
var parser = new UsfmParser(tokens, handler, _settings.Stylesheet, _settings.Versification);
parser.ProcessTokens();
return handler.GetUsfm(_settings.Stylesheet);
}
catch (Exception ex)
Expand All @@ -73,6 +79,53 @@ public string UpdateUsfm(
}
}

/// <summary>
/// Filters tokens by the specified chapters.
/// </summary>
/// <param name="tokens">The tokens.</param>
/// <param name="chapters">The chapters to keep. If null, all tokens are returned unchanged.</param>
/// <returns>
/// The tokens of the leading <c>\id</c> line (always kept when it is the first token), followed by the
/// tokens that belong to the requested chapters. Tokens between the <c>\id</c> line and the first chapter
/// marker are treated as part of chapter 1.
/// </returns>
/// <remarks>This is marked internal so test classes can use it.</remarks>
internal static IReadOnlyList<UsfmToken> FilterTokensByChapter(
    IReadOnlyList<UsfmToken> tokens,
    IReadOnlyList<int> chapters = null
)
{
    if (chapters is null)
        return tokens;

    // Membership is tested once per chapter token; a set keeps that lookup constant-time.
    var requestedChapters = new HashSet<int>(chapters);
    var tokensWithinChapters = new List<UsfmToken>();
    bool inChapter = false;
    bool inIdMarker = false;

    for (int index = 0; index < tokens.Count; index++)
    {
        UsfmToken token = tokens[index];
        if (index == 0 && token.Marker == "id")
        {
            // The id line is always retained; anything after it and before the first
            // chapter marker is considered to be part of chapter 1.
            inIdMarker = true;
            inChapter = requestedChapters.Contains(1);
        }
        else
        {
            // Any other marker ends the id line (tokens without a marker belong to it).
            if (inIdMarker && token.Marker != null && token.Marker != "id")
                inIdMarker = false;

            // Evaluate chapter markers independently of the id-line check above, so that a
            // chapter marker directly following the id line still updates the chapter state
            // instead of leaving the stale value that was set for the id line.
            if (token.Type == UsfmTokenType.Chapter)
                inChapter = int.TryParse(token.Data, out int chapter) && requestedChapters.Contains(chapter);
        }

        if (inIdMarker || inChapter)
            tokensWithinChapters.Add(token);
    }

    return tokensWithinChapters;
}

private bool Exists(string fileName) => _paratextProjectFileHandler.Exists(fileName);

private Stream Open(string fileName) => _paratextProjectFileHandler.Open(fileName);
Expand Down
70 changes: 55 additions & 15 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly HashSet<string> _preserveParagraphStyles;
private readonly Stack<UsfmUpdateBlock> _updateBlocks;
private readonly Stack<IUsfmUpdateBlockHandler> _updateBlockHandlers;
private readonly List<string> _remarks;
private readonly List<(int, string)> _remarks;
private readonly Stack<bool> _replace;
private int _tokenIndex;
private readonly Func<UsfmUpdateBlockHandlerException, bool> _errorHandler;
Expand All @@ -76,7 +76,7 @@ public UpdateUsfmParserHandler(
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null,
IEnumerable<(int, string)> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
Expand Down Expand Up @@ -107,7 +107,7 @@ public UpdateUsfmParserHandler(
preserveParagraphStyles == null
? new HashSet<string> { "r", "rem" }
: new HashSet<string>(preserveParagraphStyles);
_remarks = remarks?.ToList() ?? new List<string>();
_remarks = remarks?.ToList() ?? new List<(int, string)>();
_errorHandler = errorHandler;
if (_errorHandler == null)
_errorHandler = (error) => false;
Expand Down Expand Up @@ -433,26 +433,66 @@ public string GetUsfm(string stylesheetFileName = "usfm.sty")
public string GetUsfm(UsfmStylesheet stylesheet)
{
var tokenizer = new UsfmTokenizer(stylesheet);
List<UsfmToken> tokens = new List<UsfmToken>(_tokens);
if (_remarks.Count() > 0)
var tokens = new List<UsfmToken>(_tokens);
if (_remarks.Count > 0)
{
var remarkTokens = new List<UsfmToken>();
foreach (string remark in _remarks)
var remarkTokensByChapter = new Dictionary<int, List<UsfmToken>>();
foreach ((int chapterNum, string remark) in _remarks)
{
remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
remarkTokens.Add(new UsfmToken(remark));
// Add the remark tokens for each chapter that is to have remarks
if (!remarkTokensByChapter.TryGetValue(chapterNum, out List<UsfmToken> chapterTokens))
{
chapterTokens = new List<UsfmToken>();
remarkTokensByChapter.Add(chapterNum, chapterTokens);
}

chapterTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
chapterTokens.Add(new UsfmToken(remark));
}
if (tokens.Count > 0)
{
int index = 0;
HashSet<string> markersToSkip = new HashSet<string>() { "id", "ide", "rem" };
while (markersToSkip.Contains(tokens[index].Marker))
foreach (KeyValuePair<int, List<UsfmToken>> remarkTokens in remarkTokensByChapter)
{
index++;
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
int index;
HashSet<string> markersToSkip;
if (remarkTokens.Key == 0)
{
// Add the remarks at the top level of the USFM,
// after the book id, encode, and any initial comments
index = 0;
markersToSkip = new HashSet<string> { "id", "ide", "rem" };
}
else
{
// Add the remarks just after the specified chapter
index = tokens.FindIndex(t =>
t.Type == UsfmTokenType.Chapter
&& int.TryParse(t.Data, out int chapterNumber)
&& chapterNumber == remarkTokens.Key
);
if (index == -1)
continue;
index++;
markersToSkip = new HashSet<string> { "rem" };
}

if (index >= tokens.Count)
{
// The remark insertion point is at the very end
tokens.AddRange(remarkTokens.Value);
}
else
{
while (markersToSkip.Contains(tokens[index].Marker))
{
index++;
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
index++;
}

tokens.InsertRange(index, remarkTokens.Value);
}
}
tokens.InsertRange(index, remarkTokens);
}
}

Expand Down
Loading
Loading