Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
.vscode/*
*.test
*.test
bin/
obj/
packages/
.vs/
34 changes: 34 additions & 0 deletions dotnet/Jargon.Benchmark/Benchmarks/Lemmatizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
using BenchmarkDotNet.Attributes;
using System;
using System.IO;

namespace Jargon.Benchmark.Benchmarks
{
/// <summary>
/// Benchmark that pushes the Wikipedia sample text through the tokenizer and lemmatizer.
/// </summary>
public class Lemmatizer
{
    // Sample text used by the benchmark; assigned by LoadData.
    private static string Wikipedia;

    /// <summary>
    /// Reads testdata/wikipedia.txt, resolved against the current directory, into memory.
    /// </summary>
    [GlobalSetup]
    public void LoadData()
    {
        Wikipedia = File.ReadAllText(
            Path.Combine(Environment.CurrentDirectory, "testdata", "wikipedia.txt"));
    }

    /// <summary>
    /// Builds a lemmatizer and drains every lemma token produced from the sample text.
    /// </summary>
    [Benchmark]
    public void LemmatizerBenchmark()
    {
        var lemmatizer = new Jargon.Lemmatizer(Data.StackExchange.Instance, 3);

        using (var source = new StringReader(Wikipedia))
        using (var textTokens = new TextTokens(source))
        using (var lemmas = new LemmaTokens(in lemmatizer, textTokens))
        {
            // The tokens themselves are deliberately ignored; only the iteration is measured.
            while (lemmas.MoveNext())
            {
            }
        }
    }
}
}
30 changes: 30 additions & 0 deletions dotnet/Jargon.Benchmark/Benchmarks/Tokenize.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using BenchmarkDotNet.Attributes;
using System;
using System.IO;

namespace Jargon.Benchmark.Benchmarks
{
/// <summary>
/// Benchmark that pushes the Wikipedia sample text through the plain-text tokenizer.
/// </summary>
public class Tokenize
{
    // Sample text used by the benchmark; assigned by LoadData.
    private static string Wikipedia;

    /// <summary>
    /// Reads testdata/wikipedia.txt, resolved against the current directory, into memory.
    /// </summary>
    [GlobalSetup]
    public void LoadData()
    {
        Wikipedia = File.ReadAllText(
            Path.Combine(Environment.CurrentDirectory, "testdata", "wikipedia.txt"));
    }

    /// <summary>
    /// Drains every token produced from the sample text.
    /// </summary>
    [Benchmark]
    public void TokenizeBenchmark()
    {
        using (var source = new StringReader(Wikipedia))
        using (var textTokens = new TextTokens(source))
        {
            // The tokens themselves are deliberately ignored; only the iteration is measured.
            while (textTokens.MoveNext())
            {
            }
        }
    }
}
}
23 changes: 23 additions & 0 deletions dotnet/Jargon.Benchmark/Jargon.Benchmark.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
<LangVersion>7.2</LangVersion>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.11.1" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Jargon\Jargon.csproj" />
</ItemGroup>

<ItemGroup>
<None Update="testdata\wikipedia.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
21 changes: 21 additions & 0 deletions dotnet/Jargon.Benchmark/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Linq;
using System.Reflection;

namespace Jargon.Benchmark
{
/// <summary>
/// Entry point of the benchmark executable: runs every benchmark found in this assembly.
/// </summary>
class Program
{
    /// <summary>
    /// Assembles the benchmark configuration from an empty config and hands it to the runner.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        var defaults = DefaultConfig.Instance;

        // Start from an empty config and add pieces in a fixed order:
        // memory diagnoser, default columns, default exporters, the RyuJIT x64 job, default loggers.
        var config = ManualConfig.CreateEmpty()
            .With(new MemoryDiagnoser())
            .With(defaults.GetColumnProviders().ToArray())
            .With(defaults.GetExporters().ToArray())
            .With(Job.RyuJitX64)
            .With(defaults.GetLoggers().ToArray());

        BenchmarkRunner.Run(Assembly.GetExecutingAssembly(), config);
    }
}
}
File renamed without changes.
24 changes: 24 additions & 0 deletions dotnet/Jargon.Cmd/CompressedWebClient.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using System;
using System.Net;

namespace Jargon.Cmd
{
/// <summary>
/// A <see cref="WebClient"/> that adjusts the HTTP requests it creates: gzip/deflate
/// decompression is enabled, the user agent is set to "Jargon", and pipelining is disabled.
/// </summary>
internal sealed class CompressedWebClient : WebClient
{
    /// <summary>
    /// Creates the request via the base class and, when it is an HTTP request, configures it.
    /// </summary>
    /// <param name="address">The address the request is created for.</param>
    /// <returns>The request produced by the base class, configured when it is HTTP.</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);

        var http = request as HttpWebRequest;
        if (http == null)
        {
            // Not an HTTP request: return it exactly as the base class produced it.
            return request;
        }

        // people _still_ screw this HTTP 1.1 thing up, sigh
        http.Pipelined = false;
        // indicate who we are
        http.UserAgent = "Jargon";
        // be a good citizen
        http.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        return request;
    }
}
}
17 changes: 17 additions & 0 deletions dotnet/Jargon.Cmd/Jargon.Cmd.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
<LangVersion>7.2</LangVersion>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="CommandLineParser" Version="2.3.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Jargon\Jargon.csproj" />
</ItemGroup>

</Project>
137 changes: 137 additions & 0 deletions dotnet/Jargon.Cmd/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
using CommandLine;
using System;
using System.Diagnostics;
using System.IO;

namespace Jargon.Cmd
{
public sealed class Program
{
/// <summary>
/// Command-line options describing where the text to lemmatize comes from.
/// </summary>
private sealed class Options
{
    /// <summary>Value of the <c>-f</c> flag: a file path.</summary>
    [Option('f', Required = false, HelpText = "A file path to lemmatize")]
    public string File { get; set; }

    /// <summary>Value of the <c>-s</c> flag: a literal string.</summary>
    [Option('s', Required = false, HelpText = "A (quoted) string to lemmatize")]
    public string String { get; set; }

    /// <summary>Value of the <c>-u</c> flag: a URL.</summary>
    [Option('u', Required = false, HelpText = "A URL to fetch and lemmatize")]
    public string Url { get; set; }

    /// <summary>True when <see cref="File"/> is neither null nor empty.</summary>
    internal bool FileIsSet
    {
        get { return !string.IsNullOrEmpty(File); }
    }

    /// <summary>True when <see cref="String"/> is neither null nor empty.</summary>
    internal bool StringIsSet
    {
        get { return !string.IsNullOrEmpty(String); }
    }

    /// <summary>True when <see cref="Url"/> is neither null nor empty.</summary>
    internal bool UrlIsSet
    {
        get { return !string.IsNullOrEmpty(Url); }
    }
}

/// <summary>
/// Parses the command line; on success dispatches to <c>HandleOptions</c>,
/// otherwise prints usage examples and exits.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
public static void Main(string[] args)
{
    Parser.Default
        .ParseArguments<Options>(args)
        .WithParsed(HandleOptions)
        .WithNotParsed(errors => PrintExamplesAndExit());
}

/// <summary>
/// Lemmatizes the single input source selected on the command line (file, string or URL),
/// falling back to standard input when no source flag was given.
/// </summary>
/// <remarks>
/// Diagnostics are written to standard error so they never mix with the lemmatized text,
/// which goes to standard output. Failures terminate the process with these exit codes:
/// -2 more than one source flag set, -3 file not found, -4 file could not be processed,
/// -5 URL could not be downloaded or processed.
/// </remarks>
/// <param name="opts">The parsed command-line options.</param>
private static void HandleOptions(Options opts)
{
    var numSet =
        (opts.FileIsSet ? 1 : 0) +
        (opts.StringIsSet ? 1 : 0) +
        (opts.UrlIsSet ? 1 : 0);
    if (numSet > 1)
    {
        Console.Error.WriteLine("Only one of `f`, `s`, and `u` may be set");
        Environment.Exit(-2);
    }

    if (opts.FileIsSet)
    {
        if (!File.Exists(opts.File))
        {
            Console.Error.WriteLine($"Could not find file: {opts.File}");
            Environment.Exit(-3);
        }

        try
        {
            using (var reader = new StreamReader(File.OpenRead(opts.File)))
            {
                Lemmatize(reader);
                return;
            }
        }
        catch (Exception e)
        {
            // Covers failures while opening the file as well as while lemmatizing it.
            Console.Error.WriteLine($"Could not read file ({e.Message}): {opts.File}");
            Environment.Exit(-4);
        }
    }

    if (opts.StringIsSet)
    {
        using (var reader = new StringReader(opts.String))
        {
            Lemmatize(reader);
            return;
        }
    }

    if (opts.UrlIsSet)
    {
        try
        {
            using (var web = new CompressedWebClient())
            {
                // The whole response body is downloaded before lemmatizing starts.
                var html = web.DownloadString(opts.Url);
                using (var reader = new StringReader(html))
                {
                    Lemmatize(reader);
                }
                return;
            }
        }
        catch (Exception e)
        {
            // Covers failures while downloading as well as while lemmatizing the response.
            Console.Error.WriteLine($"Could not download url ({e.Message}): {opts.Url}");
            Environment.Exit(-5);
        }
    }

    // No source flag given: read the text to lemmatize from standard input.
    Lemmatize(Console.In);
}

/// <summary>
/// Tokenizes the text from <paramref name="reader"/>, lemmatizes the tokens, and writes
/// the value of every resulting token to standard output.
/// </summary>
/// <param name="reader">Source of the text to lemmatize.</param>
private static void Lemmatize(TextReader reader)
{
    var lem = new Lemmatizer(Data.StackExchange.Instance, 3);

    using (var words = new TextTokens(reader))
    {
        using (var lemmas = new LemmaTokens(in lem, words))
        {
            while (lemmas.MoveNext())
            {
                Console.Write(lemmas.Current.Value);
            }
        }
    }
}

/// <summary>
/// Prints usage examples, using the current process name as the command name,
/// then terminates the process with exit code -1.
/// </summary>
private static void PrintExamplesAndExit()
{
    string cmdName;
    using (var proc = Process.GetCurrentProcess())
    {
        cmdName = proc.ProcessName;
    }

    // Use the platform's separator so the sample path looks native.
    var pathSeparator = Path.DirectorySeparatorChar;

    // Every command-name placeholder must be an interpolation hole; a printf-style
    // "%s" would be printed literally.
    Console.WriteLine($@"
Usage: {cmdName} accepts piped UTF8 text from tools such as cat, curl or echo, via Stdin

Example: echo ""I luv Rails"" | {cmdName}
Alternatively, use {cmdName} 'standalone' by passing flags for text sources:
Example: {cmdName} -f {pathSeparator}path{pathSeparator}to{pathSeparator}file.txt

Results are piped to Stdout (regardless of input)");

    Environment.Exit(-1);
}
}
}
8 changes: 8 additions & 0 deletions dotnet/Jargon.Cmd/Properties/launchSettings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"profiles": {
"Jargon.Cmd": {
"commandName": "Project",
"commandLineArgs": "-u \"https://stackoverflow.com/\""
}
}
}
41 changes: 41 additions & 0 deletions dotnet/Jargon.Tests/HtmlTokensTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using System.Collections.Generic;
using System.IO;
using Xunit;
using System.Linq;

namespace Jargon.Tests
{
/// <summary>
/// Tests for the HTML tokenizer.
/// </summary>
public class HtmlTokensTests
{
    /// <summary>
    /// Tokenizes <paramref name="input"/> and checks that every expected token value
    /// appears at least once in the output. Order and extra tokens are not checked.
    /// </summary>
    /// <param name="input">HTML text to tokenize.</param>
    /// <param name="expectedTokens">Token values that must be present in the result.</param>
    [Theory]
    [InlineData("<html>\n<p foo=\"bar\">\nHi! Let's talk Ruby on Rails.\n<!-- Ignore ASPNET MVC in comments -->\n</p>\n</html>\n",
        new[]
        {
            "<p foo=\"bar\">", // tags kept whole
            "\n", // whitespace preserved
            "Hi", "!",
            "Ruby", "on", "Rails", // make sure text node got tokenized
            "<!-- Ignore ASPNET MVC in comments -->", // make sure comment kept whole
            "</p>",
        }
    )]
    public void Tokenize(string input, string[] expectedTokens)
    {
        var got = new List<Token>();
        using (var reader = new StringReader(input))
        using (var tokens = new HTMLTokens(reader))
        {
            while (tokens.MoveNext())
            {
                got.Add(tokens.Current);
            }
        }

        foreach (var expected in expectedTokens)
        {
            // Name the missing token so a failure is diagnosable from the test output.
            Assert.True(
                got.Any(g => g.Value == expected),
                $"Expected token not found in output: \"{expected}\"");
        }
    }
}
}
21 changes: 21 additions & 0 deletions dotnet/Jargon.Tests/Jargon.Tests.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>

<IsPackable>false</IsPackable>

<LangVersion>7.2</LangVersion>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.8.0" />
<PackageReference Include="xunit" Version="2.3.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.3.1" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Jargon\Jargon.csproj" />
</ItemGroup>

</Project>
Loading