From 1459ea9a0d1fb4a513d5bfaefbfc6e9aa2f54296 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Sat, 25 Oct 2025 22:00:55 +0900 Subject: [PATCH 1/8] Upgrade to Elasticsearch 9.1.4. As part of this: - Upgrade Java to JVM 21 - Upgrade Gradle to 8.11 - Fix all deprecations - Fix integrations tests on Elasticsearch and re-enable them --- .github/workflows/build.yml | 3 +- README.md | 19 +++---- build.gradle | 19 ++++--- .../worksap/nlp/tools/EsConventions.groovy | 2 +- .../worksap/nlp/tools/EsTestEnvPlugin.groovy | 9 +++- .../com/worksap/nlp/tools/engines.groovy | 3 ++ gradle.properties | 6 +-- gradle/wrapper/gradle-wrapper.properties | 2 +- integration/build.gradle | 33 +++++++++--- .../es-9.00-ge/SudachiInSearchEngineEnv.kt | 53 +++++++++++++++++++ .../es-9.00-lt}/SecurityManagerTest.kt | 0 .../SudachiInSearchEngineEnv.kt | 2 +- spi/build.gradle | 5 ++ src/main/ext/es-8.12-lt/factory-adapters.kt | 45 ++++++++++++++++ src/main/ext/es-9.00-ge/factory-adapters.kt | 45 ++++++++++++++++ .../ext/es-9.00-ge/search-engine-aliases.kt | 27 ++++++++++ .../factory-adapters.kt | 2 +- .../elasticsearch/sudachi/ConfigAdapter.kt | 3 +- subplugin/build.gradle | 7 ++- testlib/build.gradle | 9 +++- 20 files changed, 256 insertions(+), 38 deletions(-) create mode 100644 integration/src/test/ext/es-9.00-ge/SudachiInSearchEngineEnv.kt rename integration/src/test/{java/com/worksap/nlp/elasticsearch/sudachi => ext/es-9.00-lt}/SecurityManagerTest.kt (100%) rename integration/src/test/ext/{es-8.12-ge => es-9.00-lt}/SudachiInSearchEngineEnv.kt (95%) create mode 100644 src/main/ext/es-8.12-lt/factory-adapters.kt create mode 100644 src/main/ext/es-9.00-ge/factory-adapters.kt create mode 100644 src/main/ext/es-9.00-ge/search-engine-aliases.kt rename src/main/ext/{es-8.04-ge => es-9.00-lt}/factory-adapters.kt (96%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 226f6e92..18aa065d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,6 +14,7 @@ jobs: strategy: matrix: es-version: + - 'es:9.1.4' - 'es:8.15.2' - 'es:8.14.3' - 'es:8.13.4' @@ -46,7 +47,7 @@ jobs: - 'os:2.7.0' - 'os:2.6.0' env: - mainJob: ${{ matrix.es-version == 'es:8.15.2' }} + mainJob: ${{ matrix.es-version == 'es:9.1.4' }} sudachiVersion: 20241021 sudachiKind: core continue-on-error: true diff --git a/README.md b/README.md index a65e86b8..70c602a8 100644 --- a/README.md +++ b/README.md @@ -18,19 +18,20 @@ Check [changelog](./CHANGELOG.md) for more. 1. Build analysis-sudachi. ``` - $ ./gradlew -PengineVersion=es:8.15.2 build + $ ./gradlew -PengineVersion=es:9.1.4 build ``` Use `-PengineVersion=os:2.18.0` for OpenSearch. ## Supported ElasticSearch versions -1. 8.0.* until 8.15.* supported, integration tests in CI -2. 7.17.* (latest patch version) - supported, integration tests in CI -3. 7.11.* until 7.16.* - best effort support, not tested in CI -4. 7.10.* integration tests for the latest patch version -5. 7.9.* and below - not tested in CI at all, may be broken -6. 7.3.* and below - broken, not supported +1. 9.0.* until 9.1.* supported, integration tests in CI +2. 8.0.* until 8.15.* supported, integration tests in CI +3. 7.17.* (latest patch version) - supported, integration tests in CI +4. 7.11.* until 7.16.* - best effort support, not tested in CI +5. 7.10.* integration tests for the latest patch version +6. 7.9.* and below - not tested in CI at all, may be broken +7. 7.3.* and below - broken, not supported ## Supported OpenSearch versions @@ -43,11 +44,11 @@ Use `-PengineVersion=os:2.18.0` for OpenSearch. a. Using the release package ``` - $ bin/elasticsearch-plugin install https://github.com/WorksApplications/elasticsearch-sudachi/releases/download/v3.1.1/analysis-sudachi-8.13.4-3.1.1.zip + $ bin/elasticsearch-plugin install https://github.com/WorksApplications/elasticsearch-sudachi/releases/download/v3.1.1/analysis-sudachi-9.1.4-3.1.1.zip ``` b. Using self-build package ``` - $ bin/elasticsearch-plugin install file:///path/to/analysis-sudachi-8.13.4-3.1.1.zip + $ bin/elasticsearch-plugin install file:///path/to/analysis-sudachi-9.1.4-3.1.1.zip ``` (Specify the absolute path in URI format) 3. Download sudachi dictionary archive from https://github.com/WorksApplications/SudachiDict diff --git a/build.gradle b/build.gradle index 484df01b..a8296a6a 100644 --- a/build.gradle +++ b/build.gradle @@ -2,9 +2,9 @@ import org.jetbrains.kotlin.gradle.dsl.JvmTarget plugins { id 'java-library' - id 'org.jetbrains.kotlin.jvm' version '1.8.0' - id "org.jetbrains.kotlin.plugin.serialization" version "1.8.0" - id 'com.diffplug.spotless' version '6.16.0' + id 'org.jetbrains.kotlin.jvm' version '2.0.0' + id "org.jetbrains.kotlin.plugin.serialization" version '2.0.0' + id 'com.diffplug.spotless' version '6.25.0' id 'org.sonarqube' version '4.0.0.2929' id("org.jetbrains.kotlinx.kover") version "0.7.0" id 'com.worksap.nlp.sudachi.esc' @@ -13,15 +13,20 @@ plugins { } group = 'com.worksap.nlp' -archivesBaseName = 'analysis-sudachi' +base.archivesName.set('analysis-sudachi') version = properties["pluginVersion"] +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + compileKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } compileTestKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } configurations { @@ -84,7 +89,7 @@ def packageSpiJars = tasks.register('packageSpiJars', Copy) { def distZip = tasks.register('distZip', Zip) { var esKind = sudachiEs.kind.get() dependsOn embedVersion, packageJars, packageSpiJars - archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-$archivesBaseName") + archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-${base.archivesName.get()}") from("build/package/${version}/${esKind.engine.kind}-${esKind.version}", 'LICENSE', 'README.md') } diff --git a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsConventions.groovy b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsConventions.groovy index d503d671..773d3183 100644 --- a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsConventions.groovy +++ b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsConventions.groovy @@ -8,7 +8,7 @@ class EsConventions implements Plugin { @Override void apply(Project target) { target.tasks.withType(JavaCompile).configureEach { - options.release.set(11) + options.release.set(21) options.encoding = 'UTF-8' } diff --git a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy index b241ad2f..bfea1f94 100644 --- a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy +++ b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy @@ -103,6 +103,10 @@ class StringProvider implements Provider, Serializable { throw new IllegalStateException("not implemented") } + @Override + Provider filter(org.gradle.api.specs.Spec spec) { + throw new IllegalStateException("not implemented") + } @Override String toString() { @@ -153,9 +157,10 @@ class EsTestEnvPlugin implements Plugin { target.gradle.taskGraph.whenReady { boolean shouldRun = false if (target.plugins.findPlugin(EsSudachiPlugin.class) != null) { - shouldRun = shouldTestsRun(target.extensions.getByType(EsExtension).kind.get()) + def kind = target.extensions.getByType(EsExtension).kind.get() + shouldRun = shouldTestsRun(kind) } - target.tasks.findAll().forEach { Task task -> + target.tasks.withType(Test).forEach { Test task -> task.onlyIf { shouldRun } } } diff --git a/buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy b/buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy index 4bdfb308..1579b3a0 100644 --- a/buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy +++ b/buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy @@ -13,6 +13,7 @@ enum EsSupport implements EngineSupport { Es84("es-8.04"), Es810("es-8.10"), Es812("es-8.12"), + Es90("es-9.00"), String tag List keys @@ -41,6 +42,8 @@ enum EsSupport implements EngineSupport { return Es810 } else if (vers.ge(8, 12) && vers.lt(9, 0)) { return Es812 + } else if (vers.ge(9, 0)) { + return Es90 } else { throw new IllegalArgumentException("unsupported ElasticSearch version: " + vers.raw) } diff --git a/gradle.properties b/gradle.properties index 2e70ac41..d99d321a 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,8 +1,8 @@ -# elasticsearch versions: 8.15.2, 8.14.3, 8.13.4, 8.12.2, 8.11.4, 8.10.4, 8.9.2, +# elasticsearch versions: 9.1.4, 8.15.2, 8.14.3, 8.13.4, 8.12.2, 8.11.4, 8.10.4, 8.9.2, # 8.8.1, 8.6.2, 8.5.3, 8.4.3, 8.2.3, 7.17.24, 7.14.2, 7.10.2 -# opensearch version: 2.18.0, 2.17.1, 2.16.0, 2.15.0, 2.14.0, 2.13.0, 2.12.0, 2.11.1, +# opensearch version: 2.18.0, 2.17.1, 2.16.0, 2.15.0, 2.14.0, 2.13.0, 2.12.0, 2.11.1, # 2.10.0, 2.9.0, 2.8.0, 2.7.0, 2.6.0 -engineVersion=es:8.15.2 +engineVersion=es:9.1.4 org.gradle.jvmargs=-XX:MaxMetaspaceSize=350m \ --add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED \ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 37aef8d3..21d5e095 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.1.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip networkTimeout=10000 zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/integration/build.gradle b/integration/build.gradle index ea7f65a6..199d5adb 100644 --- a/integration/build.gradle +++ b/integration/build.gradle @@ -13,22 +13,33 @@ plugins { version = properties["pluginVersion"] +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + compileKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } compileTestKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } configurations { buildSudachiDict } dependencies { buildSudachiDict (project(':spi')) - compileOnly(project(':')) - compileOnly(project(':spi')) - testCompileOnly(project(':testlib')) - testCompileOnly(project(':subplugin')) + testImplementation(project(':')) + testImplementation(project(':spi')) + testImplementation(project(':testlib')) + testImplementation(project(':subplugin')) + // Add ICU plugin JARs for ES 9.x dynamic loading + testRuntimeOnly files({ + fileTree(dir: new File(project.buildDir, "cache/icu-plugin-extracted"), include: '**/*.jar') + }) { + builtBy 'extractIcuPlugin' + } testImplementation('junit:junit:4.13.1') { transitive = false } @@ -65,6 +76,12 @@ def downloadIcuPlugin = tasks.register('downloadIcuPlugin', Download.class) { overwrite(false) } +def extractIcuPlugin = tasks.register('extractIcuPlugin', Copy) { + dependsOn downloadIcuPlugin + from zipTree(downloadIcuPlugin.get().dest) + into new File(project.buildDir, "cache/icu-plugin-extracted") +} + esTestEnv { def esKind = sudachiEs.kind.get() def packageDir = rootDir.toPath().resolve("build/package/${version}/${esKind.engine.kind}-${esKind.version}") @@ -77,7 +94,6 @@ esTestEnv { } test { - onlyIf { ! (sudachiEs.isEs() && sudachiEs.kind.get().parsedVersion().ge(8, 9)) } dependsOn( ':packageJars', ':packageSpiJars', @@ -86,6 +102,7 @@ test { compileSystemDictionary, ':testlib:jar', downloadIcuPlugin, + extractIcuPlugin, ':subplugin:distZip' ) systemProperty("tests.security.manager", true) @@ -94,7 +111,7 @@ test { def distZip = tasks.register('distZip', Zip) { var esKind = sudachiEs.kind.get() - archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-$archivesBaseName") + archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-${project.name}") from( project(':subplugin').packageJars.outputs.files, project(':subplugin').embedVersion.outputs.files, diff --git a/integration/src/test/ext/es-9.00-ge/SudachiInSearchEngineEnv.kt b/integration/src/test/ext/es-9.00-ge/SudachiInSearchEngineEnv.kt new file mode 100644 index 00000000..519e69b7 --- /dev/null +++ b/integration/src/test/ext/es-9.00-ge/SudachiInSearchEngineEnv.kt @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022-2025 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.elasticsearch.sudachi + +import org.elasticsearch.indices.analysis.AnalysisModule +import org.elasticsearch.plugins.AnalysisPlugin +import org.elasticsearch.plugins.scanners.StablePluginsRegistry + +// For ES 9.x tests, we manually instantiate plugins since PluginsService API changed +class TestPluginsService(private val plugins: List) { + fun filterPlugins(clazz: Class): List = plugins.filterIsInstance(clazz) +} + +typealias PluginsServiceAlias = TestPluginsService + +private fun loadTestPlugins(): List { + return listOfNotNull( + tryLoadPlugin("com.worksap.nlp.elasticsearch.sudachi.plugin.AnalysisSudachiPlugin", true), + tryLoadPlugin("org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin", false)) +} + +private fun tryLoadPlugin(className: String, requiresSettings: Boolean): AnalysisPlugin? { + return try { + val clazz = Class.forName(className) + if (requiresSettings) { + clazz.getConstructor(org.elasticsearch.common.settings.Settings::class.java).newInstance(null) + as AnalysisPlugin + } else { + clazz.getDeclaredConstructor().newInstance() as AnalysisPlugin + } + } catch (e: Exception) { + null // Plugin not available or failed to load + } +} + +fun SudachiInSearchEngineEnv.makePluginService() = TestPluginsService(loadTestPlugins()) + +fun SudachiInSearchEngineEnv.makeAnalysisModule() = + AnalysisModule(environment(), loadTestPlugins(), StablePluginsRegistry()) diff --git a/integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/SecurityManagerTest.kt b/integration/src/test/ext/es-9.00-lt/SecurityManagerTest.kt similarity index 100% rename from integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/SecurityManagerTest.kt rename to integration/src/test/ext/es-9.00-lt/SecurityManagerTest.kt diff --git a/integration/src/test/ext/es-8.12-ge/SudachiInSearchEngineEnv.kt b/integration/src/test/ext/es-9.00-lt/SudachiInSearchEngineEnv.kt similarity index 95% rename from integration/src/test/ext/es-8.12-ge/SudachiInSearchEngineEnv.kt rename to integration/src/test/ext/es-9.00-lt/SudachiInSearchEngineEnv.kt index c141aa95..25388091 100644 --- a/integration/src/test/ext/es-8.12-ge/SudachiInSearchEngineEnv.kt +++ b/integration/src/test/ext/es-9.00-lt/SudachiInSearchEngineEnv.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024 Works Applications Co., Ltd. + * Copyright (c) 2022-2025 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spi/build.gradle b/spi/build.gradle index 8c2620cd..ea0a7511 100644 --- a/spi/build.gradle +++ b/spi/build.gradle @@ -11,6 +11,11 @@ group = 'com.worksap.nlp' version = properties["pluginVersion"] description = "Plugin interface for Sudachi search engine integrations (ElasticSearch and OpenSearch)" +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + dependencies { api('com.worksap.nlp:sudachi:0.7.4') } diff --git a/src/main/ext/es-8.12-lt/factory-adapters.kt b/src/main/ext/es-8.12-lt/factory-adapters.kt new file mode 100644 index 00000000..15879ded --- /dev/null +++ b/src/main/ext/es-8.12-lt/factory-adapters.kt @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022-2025 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@file:Suppress("UNUSED_PARAMETER", "PackageDirectoryMismatch") + +package com.worksap.nlp.search.aliases + +import org.apache.lucene.analysis.Analyzer +import org.elasticsearch.common.settings.Settings +import org.elasticsearch.index.IndexSettings + +abstract class AbstractTokenizerFactory( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractTokenizerFactory(indexSettings, settings, name) + +abstract class AbstractTokenFilterFactory( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractTokenFilterFactory(name, settings) + +abstract class AbstractIndexAnalyzerProvider( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider(name, settings) + diff --git a/src/main/ext/es-9.00-ge/factory-adapters.kt b/src/main/ext/es-9.00-ge/factory-adapters.kt new file mode 100644 index 00000000..2d2202d2 --- /dev/null +++ b/src/main/ext/es-9.00-ge/factory-adapters.kt @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022-2025 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@file:Suppress("UNUSED_PARAMETER", "PackageDirectoryMismatch") + +package com.worksap.nlp.search.aliases + +import org.apache.lucene.analysis.Analyzer +import org.elasticsearch.common.settings.Settings +import org.elasticsearch.env.Environment +import org.elasticsearch.index.IndexSettings + +abstract class AbstractTokenizerFactory( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractTokenizerFactory(name!!) + +abstract class AbstractTokenFilterFactory( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractTokenFilterFactory(name!!) + +abstract class AbstractIndexAnalyzerProvider( + indexSettings: IndexSettings?, + environment: Environment?, + name: String?, + settings: Settings? +) : org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider(name!!) diff --git a/src/main/ext/es-9.00-ge/search-engine-aliases.kt b/src/main/ext/es-9.00-ge/search-engine-aliases.kt new file mode 100644 index 00000000..971f731a --- /dev/null +++ b/src/main/ext/es-9.00-ge/search-engine-aliases.kt @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022-2025 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@file:Suppress("PackageDirectoryMismatch") +@file:JvmName("SearchEngineAliasesEs9") + +package com.worksap.nlp.search.aliases + +/** ES 9.0+ specific extensions: Environment.configFile() was changed to Environment.configDir() */ +import java.nio.file.Path + +fun org.elasticsearch.env.Environment.configFile(): Path { + return this.configDir() +} diff --git a/src/main/ext/es-8.04-ge/factory-adapters.kt b/src/main/ext/es-9.00-lt/factory-adapters.kt similarity index 96% rename from src/main/ext/es-8.04-ge/factory-adapters.kt rename to src/main/ext/es-9.00-lt/factory-adapters.kt index 75b53414..4e99d8e0 100644 --- a/src/main/ext/es-8.04-ge/factory-adapters.kt +++ b/src/main/ext/es-9.00-lt/factory-adapters.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024 Works Applications Co., Ltd. + * Copyright (c) 2022-2025 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt index c4f72117..6d944acb 100644 --- a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt +++ b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024 Works Applications Co., Ltd. + * Copyright (c) 2022-2025 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ package com.worksap.nlp.elasticsearch.sudachi import com.worksap.nlp.search.aliases.Environment import com.worksap.nlp.search.aliases.Settings +import com.worksap.nlp.search.aliases.configFile import com.worksap.nlp.sudachi.Config import com.worksap.nlp.sudachi.PathAnchor import com.worksap.nlp.sudachi.Tokenizer.SplitMode diff --git a/subplugin/build.gradle b/subplugin/build.gradle index 211d891b..ef9804ba 100644 --- a/subplugin/build.gradle +++ b/subplugin/build.gradle @@ -8,6 +8,11 @@ plugins { group = 'com.worksap.nlp' version = properties["pluginVersion"] +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + dependencies { compileOnly(project(':spi')) } @@ -36,7 +41,7 @@ def packageJars = tasks.register('packageJars', Copy) { def distZip = tasks.register('distZip', Zip) { var esKind = sudachiEs.kind.get() dependsOn embedVersion, packageJars - archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-$archivesBaseName") + archiveBaseName.set("${esKind.engine.kind}-${esKind.version}-${project.name}") from("build/package/${version}/${esKind.engine.kind}-${esKind.version}") } diff --git a/testlib/build.gradle b/testlib/build.gradle index 0bc42215..025ab702 100644 --- a/testlib/build.gradle +++ b/testlib/build.gradle @@ -11,12 +11,17 @@ plugins { version = properties["pluginVersion"] +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + compileKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } compileTestKotlin { - compilerOptions.jvmTarget.set(JvmTarget.JVM_11) + compilerOptions.jvmTarget.set(JvmTarget.JVM_21) } dependencies { From 0ddb0191654b25a406d2b12998d66ebaa9c8e9b3 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:13:55 +0900 Subject: [PATCH 2/8] - Fix entitlement policy - Move sudachi lib requirement as a direct implementation dep (otherwise breaks entitlement policy) - Add entitlement policy test --- build.gradle | 25 ++-- .../elasticsearch/sudachi/EntitlementTest.kt | 124 ++++++++++++++++++ src/main/extras/entitlement-policy.yaml | 11 ++ 3 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/EntitlementTest.kt create mode 100644 src/main/extras/entitlement-policy.yaml diff --git a/build.gradle b/build.gradle index a8296a6a..d855257d 100644 --- a/build.gradle +++ b/build.gradle @@ -44,7 +44,9 @@ sourceSets { } dependencies { - spi(project(':spi')) + implementation(project(':spi')) + implementation('com.worksap.nlp:sudachi:0.7.4') + testImplementation(project(':testlib')) testImplementation('org.apache.logging.log4j:log4j-core:2.17.2') testImplementation('org.jetbrains.kotlin:kotlin-test-junit') { @@ -57,13 +59,16 @@ dependencies { def embedVersion = tasks.register('embedVersion', Copy) { var esKind = sudachiEs.kind.get() - from 'src/main/extras/plugin-descriptor.properties' + from('src/main/extras/plugin-descriptor.properties') { + expand([ + version: version, + engineVersion: esKind.version, + engineKind: esKind.engine.kind + ]) + } + // Include entitlement policy for Elasticsearch 9+ + from('src/main/extras/entitlement-policy.yaml') into "build/package/${version}/${esKind.engine.kind}-${esKind.version}" - expand([ - version: version, - engineVersion: esKind.version, - engineKind: esKind.engine.kind - ]) inputs.property("version", version) inputs.property("elasticSearchVersion", esKind.version) } @@ -79,11 +84,7 @@ def packageJars = tasks.register('packageJars', Copy) { def packageSpiJars = tasks.register('packageSpiJars', Copy) { from configurations.spi var esKind = sudachiEs.kind.get() - if (sudachiEs.hasPluginSpiSupport()) { - into "build/package/${version}/${esKind.engine.kind}-${esKind.version}/spi" - } else { - into "build/package/${version}/${esKind.engine.kind}-${esKind.version}" - } + into "build/package/${version}/${esKind.engine.kind}-${esKind.version}" } def distZip = tasks.register('distZip', Zip) { diff --git a/integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/EntitlementTest.kt b/integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/EntitlementTest.kt new file mode 100644 index 00000000..3a94f24a --- /dev/null +++ b/integration/src/test/java/com/worksap/nlp/elasticsearch/sudachi/EntitlementTest.kt @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2025 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.elasticsearch.sudachi + +import kotlin.io.path.exists +import kotlin.test.Test +import org.junit.Assert + +/** + * Integration test to verify that the entitlement policy grants proper permissions for the plugin + * to operate in Elasticsearch 9+. + * + * This test verifies: + * 1. Plugin and dictionary files are accessible + * 2. Default config can be read from plugin JAR (requires file read entitlement) + * 3. Tokenizer works end-to-end without NotEntitledException + * 4. Entitlement policy file is valid and parseable + */ +class EntitlementTest : SudachiEnvTest() { + + @Test + fun pluginAndDictionaryFilesAccessible() { + // Verify plugin directory structure and dictionary files are accessible + // This requires entitlement policy to grant read access to both plugin and config directories + val pluginDir = sudachiEnv.pluginsPath.resolve("analysis-sudachi") + Assert.assertTrue("Plugin directory should exist: $pluginDir", pluginDir.exists()) + + // Check SPI directory if it exists + val spiDir = pluginDir.resolve("spi") + if (spiDir.exists()) { + Assert.assertTrue("SPI directory should be readable: $spiDir", spiDir.toFile().canRead()) + } + + // Verify dictionary files in config/sudachi directory + val sudachiConfigDir = sudachiEnv.configPath.resolve("sudachi") + Assert.assertTrue("Config directory should exist: $sudachiConfigDir", sudachiConfigDir.exists()) + + val systemDict = sudachiConfigDir.resolve("system_core.dic") + Assert.assertTrue("System dictionary should exist: $systemDict", systemDict.exists()) + } + + @Test + fun tokenizerWorksWithDefaultConfig() { + // End-to-end test: Create tokenizer without settings_path (forces default config from JAR) + // This exercises the full entitlement chain: + // 1. Plugin loads without NotEntitledException + // 2. Read default config from plugin JAR + // 3. Read dictionary from config directory + // 4. Initialize and use tokenizer + + val req = AnalyzeActionRequestAlias("sudachi_test") + req.tokenizer( + mapOf( + "type" to "sudachi_tokenizer", + // No settings_path - forces Config.defaultConfig() call from JAR + "split_mode" to "C")) + req.text("東京都") + + val analyzers = analysisRegistry() + + // If entitlement is missing, this would throw NotEntitledException + val response = + TransportAnalyzeActionAlias.analyze( + req, + analyzers, + null, + 1000, + ) + + // Verify tokenization works correctly + Assert.assertTrue("Should tokenize successfully", response.tokens.isNotEmpty()) + Assert.assertEquals("東京都", response.tokens[0].term) + } + + @Test + fun entitlementPolicyIsValidAndParseable() { + // This test validates that: + // 1. The entitlement-policy.yaml file is packaged with the plugin + // 2. The policy would be accepted during real plugin installation + // It uses Elasticsearch's actual policy parser to ensure the syntax is correct + val pluginDir = sudachiEnv.pluginsPath.resolve("analysis-sudachi") + val entitlementPolicyFile = pluginDir.resolve("entitlement-policy.yaml") + + Assert.assertTrue( + "entitlement-policy.yaml must exist in plugin package: $entitlementPolicyFile", + entitlementPolicyFile.exists()) + + try { + // Use Elasticsearch's PolicyUtils to parse the policy file + // This is the same parser used during plugin installation + val policy = + org.elasticsearch.entitlement.runtime.policy.PolicyUtils.parsePolicyIfExists( + "analysis-sudachi", entitlementPolicyFile, true) + + // If we get here without an exception, the policy file is valid! + Assert.assertNotNull( + "Policy should be parsed successfully - this means the YAML syntax is correct and would be accepted during real plugin installation", + policy) + + println("SUCCESS: entitlement-policy.yaml parsed successfully by Elasticsearch!") + println("Policy name: ${policy.name}") + } catch (e: org.elasticsearch.entitlement.runtime.policy.PolicyParserException) { + Assert.fail( + "entitlement-policy.yaml has invalid syntax and would fail during plugin installation: ${e.message}") + } catch (e: Exception) { + Assert.fail( + "Unexpected error while parsing entitlement-policy.yaml: ${e.javaClass.simpleName}: ${e.message}") + } + } +} diff --git a/src/main/extras/entitlement-policy.yaml b/src/main/extras/entitlement-policy.yaml new file mode 100644 index 00000000..6141e9d1 --- /dev/null +++ b/src/main/extras/entitlement-policy.yaml @@ -0,0 +1,11 @@ +# Entitlement policy for analysis-sudachi plugin +# This file declares the permissions required by the plugin to operate +# See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/_entitlements.html + +# Grant file read access for the plugin and its dependencies +ALL-UNNAMED: + - files: + # Allow reading from the config directory for dictionaries + - relative_path: sudachi + relative_to: config + mode: read From 2b42f49ff57610bb788701b48b69eb0fa1c0414b Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:19:52 +0900 Subject: [PATCH 3/8] Fix loading char.def from unentitled filesystem --- .../elasticsearch/sudachi/ConfigAdapter.kt | 4 +- .../com/worksap/nlp/test/TestDictionary.kt | 4 + .../nlp/lucene/sudachi/ja/additional.json | 1 + .../worksap/nlp/lucene/sudachi/ja/char.def | 167 ++++++++++++++++++ 4 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt index 6d944acb..ba6af4c2 100644 --- a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt +++ b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt @@ -28,7 +28,7 @@ import kotlin.io.path.exists class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) { private val basePath = resourcesPath(env, settings) - private val fullAnchor = PathAnchor.filesystem(basePath).andThen(anchor) + private val fullAnchor = PathAnchor.filesystem(basePath) val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true) // default false to let every morpheme have non-null span in the input text @@ -83,7 +83,7 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) { } private fun readDefaultConfig(root: Path, baseAnchor: PathAnchor): Config { - val anchor = PathAnchor.filesystem(root).andThen(baseAnchor) + val anchor = PathAnchor.filesystem(root) val resolved = root.resolve(DEFAULT_SETTINGS_FILENAME) val exists = try { diff --git a/src/test/java/com/worksap/nlp/test/TestDictionary.kt b/src/test/java/com/worksap/nlp/test/TestDictionary.kt index 6219bc01..a7ec5eb7 100644 --- a/src/test/java/com/worksap/nlp/test/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/test/TestDictionary.kt @@ -117,6 +117,7 @@ constructor( parts.add("system_core.dic") parts.add("sudachi.json") parts.add("unk.def") + parts.add("char.def") } parts } @@ -137,6 +138,9 @@ constructor( if (resources.contains("unk.def")) { ResourceUtil.copyResource("unk.def", sudachiFolder, false) } + if (resources.contains("char.def")) { + ResourceUtil.copyResource("char.def", sudachiFolder, false) + } if (resources.contains("system_core.dic")) { writeSystemDic(sudachiFolder.toPath().resolve("system_core.dic")) } diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json index 561e05d2..e02aed99 100644 --- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json +++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json @@ -2,6 +2,7 @@ "systemDict" : "system_core.dic", "oovProviderPlugin" : [ { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", + "charDef" : "char.def", "unkDef" : "unk.def" }, { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def new file mode 100644 index 00000000..b9728b79 --- /dev/null +++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def @@ -0,0 +1,167 @@ +# +# Japanese charcter category map +# +# $Id: char.def 9 2012-12-12 04:13:15Z togiso $; +# + +################################################################################### +# +# CHARACTER CATEGORY DEFINITION +# +# CATEGORY_NAME INVOKE GROUP LENGTH +# +# - CATEGORY_NAME: Name of category. you have to define DEFAULT class. +# - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon +# - GROUP: 1/0: make a new word by grouping the same chracter category +# - LENGTH: n: 1 to n length new words are added +# +DEFAULT 0 1 0 # DEFAULT is a mandatory category! +SPACE 0 1 0 +KANJI 0 0 2 +SYMBOL 1 1 0 +NUMERIC 1 1 0 +ALPHA 1 1 0 +HIRAGANA 0 1 2 +KATAKANA 1 1 2 +KANJINUMERIC 0 1 0 #change INVOKE 1->0 +GREEK 1 1 0 +CYRILLIC 1 1 0 + +################################################################################### +# +# CODE(UCS2) TO CATEGORY MAPPING +# + +# SPACE +0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE +0x000D SPACE +0x0009 SPACE +0x000B SPACE +0x000A SPACE + +# ASCII +0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ +0x0030..0x0039 NUMERIC #0-9 +0x003A..0x0040 SYMBOL #:;<=>?@ +0x0041..0x005A ALPHA #A-Z +0x005B..0x0060 SYMBOL #[\]^_` +0x0061..0x007A ALPHA #a-z +0x007B..0x007E SYMBOL #{|}~ + +# Latin +0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ +0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö +0x00D7 SYMBOL # Latin 1 #× +0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö +0x00F7 SYMBOL # Latin 1 #÷ +0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ +0x0100..0x017F ALPHA # Latin Extended A +0x0180..0x0236 ALPHA # Latin Extended B +0x1E00..0x1EF9 ALPHA # Latin Extended Additional + +# CYRILLIC +0x0400..0x04F9 CYRILLIC #Ѐ->ӹ +0x0500..0x050F CYRILLIC # Cyrillic supplementary + +# GREEK +0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ + +# HIRAGANA +0x3041..0x309F HIRAGANA + +# KATAKANA +#0x30A1..0x30FF KATAKANA +0x30A1..0x30FA KATAKANA +0x30FC..0x30FF KATAKANA +0x31F0..0x31FF KATAKANA # Small KU .. Small RO +# 0x30FC KATAKANA HIRAGANA # ー +0x30A1 NOOOVBOW # Small A +0x30A3 NOOOVBOW +0x30A5 NOOOVBOW +0x30A7 NOOOVBOW +0x30A9 NOOOVBOW +0x30E3 NOOOVBOW +0x30E5 NOOOVBOW +0x30E7 NOOOVBOW +0x30EE NOOOVBOW +0x30FB..0x30FE NOOOVBOW + +# Half KATAKANA +0xFF66..0xFF9D KATAKANA +0xFF9E..0xFF9F KATAKANA + +# KANJI +0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement +0x2F00..0x2FD5 KANJI +0x3005 KANJI NOOOVBOW +0x3007 KANJI +0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention +#0x4E00..0x9FA5 KANJI +0x4E00..0x9FFF KANJI +0xF900..0xFA2D KANJI +0xFA30..0xFA6A KANJI + + +# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) +0x4E00 KANJINUMERIC KANJI +0x4E8C KANJINUMERIC KANJI +0x4E09 KANJINUMERIC KANJI +0x56DB KANJINUMERIC KANJI +0x4E94 KANJINUMERIC KANJI +0x516D KANJINUMERIC KANJI +0x4E03 KANJINUMERIC KANJI +0x516B KANJINUMERIC KANJI +0x4E5D KANJINUMERIC KANJI +0x5341 KANJINUMERIC KANJI +0x767E KANJINUMERIC KANJI +0x5343 KANJINUMERIC KANJI +0x4E07 KANJINUMERIC KANJI +0x5104 KANJINUMERIC KANJI +0x5146 KANJINUMERIC KANJI + +# ZENKAKU +0xFF10..0xFF19 NUMERIC +0xFF21..0xFF3A ALPHA +0xFF41..0xFF5A ALPHA +0xFF01..0xFF0F SYMBOL #!->/ +0xFF1A..0xFF20 SYMBOL #:->@ +0xFF3B..0xFF40 SYMBOL #[->` +0xFF5B..0xFF65 SYMBOL #{->・ +0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form + +# OTHER SYMBOLS +0x2000..0x206F SYMBOL # General Punctuation +0x2070..0x209F NUMERIC # Superscripts and Subscripts +0x20A0..0x20CF SYMBOL # Currency Symbols +0x20D0..0x20FF SYMBOL # Combining Diaritical Marks for Symbols +0x2100..0x214F SYMBOL # Letterlike Symbols +0x2150..0x218F NUMERIC # Number forms +0x2100..0x214B SYMBOL # Letterlike Symbols +0x2190..0x21FF SYMBOL # Arrow +0x2200..0x22FF SYMBOL # Mathematical Operators +0x2300..0x23FF SYMBOL # Miscellaneuos Technical +0x2460..0x24FF SYMBOL # Enclosed NUMERICs +0x2501..0x257F SYMBOL # Box Drawing +0x2580..0x259F SYMBOL # Block Elements +0x25A0..0x25FF SYMBOL # Geometric Shapes +0x2600..0x26FE SYMBOL # Miscellaneous Symbols +0x2700..0x27BF SYMBOL # Dingbats +0x27F0..0x27FF SYMBOL # Supplemental Arrows A +0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A +0x2800..0x28FF SYMBOL # Braille Patterns +0x2900..0x297F SYMBOL # Supplemental Arrows B +0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows +0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators +0x3300..0x33FF SYMBOL +0x3200..0x32FE SYMBOL # ENclosed CJK Letters and Months +0x3000..0x303F SYMBOL # CJK Symbol and Punctuation +0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms +0xFE50..0xFE6B SYMBOL # Small Form Variants + +# added 2006/3/13 +0x3007 SYMBOL KANJINUMERIC + +# added 2018/11/30 +0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks + +# END OF TABLE From 2cb948bcfb1f3afa2f661fdf6b7899fc645ce34a Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:27:36 +0900 Subject: [PATCH 4/8] Fix plugin loading test --- .../com/worksap/nlp/tools/EsTestEnvPlugin.groovy | 8 ++++++++ integration/build.gradle | 1 + src/test/java/com/worksap/nlp/test/TestDictionary.kt | 2 +- .../nlp/lucene/sudachi/ja/sudachi_subplugin.json | 10 ++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json diff --git a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy index bfea1f94..40dad308 100644 --- a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy +++ b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy @@ -28,10 +28,15 @@ class EsTestEnvExtension { Path configFile = null List additionalJars = new ArrayList<>() List additionalPlugins = new ArrayList<>() + List additionalConfigFiles = new ArrayList<>() void addPlugin(String name, Object value) { additionalPlugins.add(new PluginDescriptor(name: name, value: value)) } + + void addConfigFile(Path configFile) { + additionalConfigFiles.add(configFile) + } } class PluginDescriptor { @@ -209,6 +214,9 @@ class EsTestEnvPlugin implements Plugin { Files.createDirectories(sudachiConfigDir) Files.copy(ext.systemDic, sudachiConfigDir.resolve("system_core.dic")) Files.copy(ext.configFile, sudachiConfigDir.resolve("sudachi.json")) + for (Path additionalConfig in ext.additionalConfigFiles) { + Files.copy(additionalConfig, sudachiConfigDir.resolve(additionalConfig.fileName)) + } return rootPath } diff --git a/integration/build.gradle b/integration/build.gradle index 199d5adb..f656942c 100644 --- a/integration/build.gradle +++ b/integration/build.gradle @@ -91,6 +91,7 @@ esTestEnv { additionalJars.add(project(":testlib").getTasksByName('jar', false).first().outputs.files.singleFile.toPath()) addPlugin("analysis-icu", downloadIcuPlugin) addPlugin('sudachi-sub', project(':subplugin').getTasksByName('distZip', false).first()) + addConfigFile(rootProject.rootDir.toPath().resolve("src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json")) } test { diff --git a/src/test/java/com/worksap/nlp/test/TestDictionary.kt b/src/test/java/com/worksap/nlp/test/TestDictionary.kt index a7ec5eb7..3805c2a2 100644 --- a/src/test/java/com/worksap/nlp/test/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/test/TestDictionary.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024 Works Applications Co., Ltd. + * Copyright (c) 2022-2025 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json new file mode 100644 index 00000000..1b38cd5d --- /dev/null +++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json @@ -0,0 +1,10 @@ +{ + "systemDict" : "system_core.dic", + "oovProviderPlugin" : [ + { "class" : "com.worksap.nlp.elasticsearch.sudachi.plugin.FakeOovPlugin" } + ], + "pathRewritePlugin" : [ + { "class" : "com.worksap.nlp.sudachi.JoinOovPlugin", + "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ] } + ] +} From 6dd1813cfa50e85d902d4916a035fbe3476dc84c Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:35:37 +0900 Subject: [PATCH 5/8] Unify config file loading --- .../worksap/nlp/tools/EsTestEnvPlugin.groovy | 20 ++++++++++--------- integration/build.gradle | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy index 40dad308..dd3273b6 100644 --- a/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy +++ b/buildSrc/src/main/groovy/com/worksap/nlp/tools/EsTestEnvPlugin.groovy @@ -24,18 +24,17 @@ import java.util.zip.ZipFile class EsTestEnvExtension { Path bundlePath = null - Path systemDic = null - Path configFile = null List additionalJars = new ArrayList<>() List additionalPlugins = new ArrayList<>() - List additionalConfigFiles = new ArrayList<>() + List configFiles = new ArrayList<>() void addPlugin(String name, Object value) { additionalPlugins.add(new PluginDescriptor(name: name, value: value)) } - void addConfigFile(Path configFile) { - additionalConfigFiles.add(configFile) + void addConfigFile(Path sourcePath, String targetName = null) { + def target = targetName ?: sourcePath.fileName.toString() + configFiles.add(new ConfigFileDescriptor(source: sourcePath, target: target)) } } @@ -54,6 +53,11 @@ class PluginDescriptor { } } +class ConfigFileDescriptor { + Path source + String target +} + class StringProvider implements Provider, Serializable { private static final long serialVersionUID = 42L String value @@ -212,10 +216,8 @@ class EsTestEnvPlugin implements Plugin { def sudachiConfigDir = configPath.resolve("sudachi") Files.createDirectories(sudachiConfigDir) - Files.copy(ext.systemDic, sudachiConfigDir.resolve("system_core.dic")) - Files.copy(ext.configFile, sudachiConfigDir.resolve("sudachi.json")) - for (Path additionalConfig in ext.additionalConfigFiles) { - Files.copy(additionalConfig, sudachiConfigDir.resolve(additionalConfig.fileName)) + for (ConfigFileDescriptor config in ext.configFiles) { + Files.copy(config.source, sudachiConfigDir.resolve(config.target)) } return rootPath diff --git a/integration/build.gradle b/integration/build.gradle index f656942c..576edea9 100644 --- a/integration/build.gradle +++ b/integration/build.gradle @@ -86,11 +86,11 @@ esTestEnv { def esKind = sudachiEs.kind.get() def packageDir = rootDir.toPath().resolve("build/package/${version}/${esKind.engine.kind}-${esKind.version}") bundlePath = packageDir - systemDic = compileSystemDictionary.get().outputs.files.singleFile.toPath() - configFile = rootProject.rootDir.toPath().resolve("src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json") additionalJars.add(project(":testlib").getTasksByName('jar', false).first().outputs.files.singleFile.toPath()) addPlugin("analysis-icu", downloadIcuPlugin) addPlugin('sudachi-sub', project(':subplugin').getTasksByName('distZip', false).first()) + addConfigFile(compileSystemDictionary.get().outputs.files.singleFile.toPath(), "system_core.dic") + addConfigFile(rootProject.rootDir.toPath().resolve("src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json"), "sudachi.json") addConfigFile(rootProject.rootDir.toPath().resolve("src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json")) } From addb438b94038d05bcb4f3a2a7a6b3ab6c5bdf95 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:33:52 +0900 Subject: [PATCH 6/8] Fix config file anchoring --- .../com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt index ba6af4c2..2c6d0313 100644 --- a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt +++ b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt @@ -83,7 +83,6 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) { } private fun readDefaultConfig(root: Path, baseAnchor: PathAnchor): Config { - val anchor = PathAnchor.filesystem(root) val resolved = root.resolve(DEFAULT_SETTINGS_FILENAME) val exists = try { @@ -92,9 +91,9 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) { false } return if (exists) { - Config.fromFile(resolved, anchor) + Config.fromFile(resolved, baseAnchor) } else { - Config.defaultConfig(anchor) + Config.defaultConfig(baseAnchor) } } From 4a1b6343b4c62a45fe65bd89728ece9a60f047b4 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:02:33 +0900 Subject: [PATCH 7/8] Load config files from the sudachi jar --- .../com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt index 2c6d0313..5d9ae00f 100644 --- a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt +++ b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt @@ -27,8 +27,10 @@ import java.nio.file.Path import kotlin.io.path.exists class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) { + // Try filesystem first (config/sudachi), then fallback to classpath (sudachi jar) private val basePath = resourcesPath(env, settings) - private val fullAnchor = PathAnchor.filesystem(basePath) + private val fullAnchor = + PathAnchor.filesystem(basePath).andThen(PathAnchor.classpath(ConfigAdapter::class.java)) val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true) // default false to let every morpheme have non-null span in the input text From da2b0c9de0216bbd93322fdca889ea921a8a1820 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:23:49 +0900 Subject: [PATCH 8/8] Revert unneeded test files --- .../com/worksap/nlp/test/TestDictionary.kt | 4 - .../nlp/lucene/sudachi/ja/additional.json | 1 - .../worksap/nlp/lucene/sudachi/ja/char.def | 167 ------------------ .../lucene/sudachi/ja/sudachi_subplugin.json | 10 -- 4 files changed, 182 deletions(-) delete mode 100644 src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def delete mode 100644 src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json diff --git a/src/test/java/com/worksap/nlp/test/TestDictionary.kt b/src/test/java/com/worksap/nlp/test/TestDictionary.kt index 3805c2a2..d82de34e 100644 --- a/src/test/java/com/worksap/nlp/test/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/test/TestDictionary.kt @@ -117,7 +117,6 @@ constructor( parts.add("system_core.dic") parts.add("sudachi.json") parts.add("unk.def") - parts.add("char.def") } parts } @@ -138,9 +137,6 @@ constructor( if (resources.contains("unk.def")) { ResourceUtil.copyResource("unk.def", sudachiFolder, false) } - if (resources.contains("char.def")) { - ResourceUtil.copyResource("char.def", sudachiFolder, false) - } if (resources.contains("system_core.dic")) { writeSystemDic(sudachiFolder.toPath().resolve("system_core.dic")) } diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json index e02aed99..561e05d2 100644 --- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json +++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/additional.json @@ -2,7 +2,6 @@ "systemDict" : "system_core.dic", "oovProviderPlugin" : [ { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", - "charDef" : "char.def", "unkDef" : "unk.def" }, { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def deleted file mode 100644 index b9728b79..00000000 --- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/char.def +++ /dev/null @@ -1,167 +0,0 @@ -# -# Japanese charcter category map -# -# $Id: char.def 9 2012-12-12 04:13:15Z togiso $; -# - -################################################################################### -# -# CHARACTER CATEGORY DEFINITION -# -# CATEGORY_NAME INVOKE GROUP LENGTH -# -# - CATEGORY_NAME: Name of category. you have to define DEFAULT class. -# - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon -# - GROUP: 1/0: make a new word by grouping the same chracter category -# - LENGTH: n: 1 to n length new words are added -# -DEFAULT 0 1 0 # DEFAULT is a mandatory category! -SPACE 0 1 0 -KANJI 0 0 2 -SYMBOL 1 1 0 -NUMERIC 1 1 0 -ALPHA 1 1 0 -HIRAGANA 0 1 2 -KATAKANA 1 1 2 -KANJINUMERIC 0 1 0 #change INVOKE 1->0 -GREEK 1 1 0 -CYRILLIC 1 1 0 - -################################################################################### -# -# CODE(UCS2) TO CATEGORY MAPPING -# - -# SPACE -0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE -0x000D SPACE -0x0009 SPACE -0x000B SPACE -0x000A SPACE - -# ASCII -0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ -0x0030..0x0039 NUMERIC #0-9 -0x003A..0x0040 SYMBOL #:;<=>?@ -0x0041..0x005A ALPHA #A-Z -0x005B..0x0060 SYMBOL #[\]^_` -0x0061..0x007A ALPHA #a-z -0x007B..0x007E SYMBOL #{|}~ - -# Latin -0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ -0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö -0x00D7 SYMBOL # Latin 1 #× -0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö -0x00F7 SYMBOL # Latin 1 #÷ -0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ -0x0100..0x017F ALPHA # Latin Extended A -0x0180..0x0236 ALPHA # Latin Extended B -0x1E00..0x1EF9 ALPHA # Latin Extended Additional - -# CYRILLIC -0x0400..0x04F9 CYRILLIC #Ѐ->ӹ -0x0500..0x050F CYRILLIC # Cyrillic supplementary - -# GREEK -0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ - -# HIRAGANA -0x3041..0x309F HIRAGANA - -# KATAKANA -#0x30A1..0x30FF KATAKANA -0x30A1..0x30FA KATAKANA -0x30FC..0x30FF KATAKANA -0x31F0..0x31FF KATAKANA # Small KU .. Small RO -# 0x30FC KATAKANA HIRAGANA # ー -0x30A1 NOOOVBOW # Small A -0x30A3 NOOOVBOW -0x30A5 NOOOVBOW -0x30A7 NOOOVBOW -0x30A9 NOOOVBOW -0x30E3 NOOOVBOW -0x30E5 NOOOVBOW -0x30E7 NOOOVBOW -0x30EE NOOOVBOW -0x30FB..0x30FE NOOOVBOW - -# Half KATAKANA -0xFF66..0xFF9D KATAKANA -0xFF9E..0xFF9F KATAKANA - -# KANJI -0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement -0x2F00..0x2FD5 KANJI -0x3005 KANJI NOOOVBOW -0x3007 KANJI -0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention -#0x4E00..0x9FA5 KANJI -0x4E00..0x9FFF KANJI -0xF900..0xFA2D KANJI -0xFA30..0xFA6A KANJI - - -# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) -0x4E00 KANJINUMERIC KANJI -0x4E8C KANJINUMERIC KANJI -0x4E09 KANJINUMERIC KANJI -0x56DB KANJINUMERIC KANJI -0x4E94 KANJINUMERIC KANJI -0x516D KANJINUMERIC KANJI -0x4E03 KANJINUMERIC KANJI -0x516B KANJINUMERIC KANJI -0x4E5D KANJINUMERIC KANJI -0x5341 KANJINUMERIC KANJI -0x767E KANJINUMERIC KANJI -0x5343 KANJINUMERIC KANJI -0x4E07 KANJINUMERIC KANJI -0x5104 KANJINUMERIC KANJI -0x5146 KANJINUMERIC KANJI - -# ZENKAKU -0xFF10..0xFF19 NUMERIC -0xFF21..0xFF3A ALPHA -0xFF41..0xFF5A ALPHA -0xFF01..0xFF0F SYMBOL #!->/ -0xFF1A..0xFF20 SYMBOL #:->@ -0xFF3B..0xFF40 SYMBOL #[->` -0xFF5B..0xFF65 SYMBOL #{->・ -0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form - -# OTHER SYMBOLS -0x2000..0x206F SYMBOL # General Punctuation -0x2070..0x209F NUMERIC # Superscripts and Subscripts -0x20A0..0x20CF SYMBOL # Currency Symbols -0x20D0..0x20FF SYMBOL # Combining Diaritical Marks for Symbols -0x2100..0x214F SYMBOL # Letterlike Symbols -0x2150..0x218F NUMERIC # Number forms -0x2100..0x214B SYMBOL # Letterlike Symbols -0x2190..0x21FF SYMBOL # Arrow -0x2200..0x22FF SYMBOL # Mathematical Operators -0x2300..0x23FF SYMBOL # Miscellaneuos Technical -0x2460..0x24FF SYMBOL # Enclosed NUMERICs -0x2501..0x257F SYMBOL # Box Drawing -0x2580..0x259F SYMBOL # Block Elements -0x25A0..0x25FF SYMBOL # Geometric Shapes -0x2600..0x26FE SYMBOL # Miscellaneous Symbols -0x2700..0x27BF SYMBOL # Dingbats -0x27F0..0x27FF SYMBOL # Supplemental Arrows A -0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A -0x2800..0x28FF SYMBOL # Braille Patterns -0x2900..0x297F SYMBOL # Supplemental Arrows B -0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows -0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators -0x3300..0x33FF SYMBOL -0x3200..0x32FE SYMBOL # ENclosed CJK Letters and Months -0x3000..0x303F SYMBOL # CJK Symbol and Punctuation -0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms -0xFE50..0xFE6B SYMBOL # Small Form Variants - -# added 2006/3/13 -0x3007 SYMBOL KANJINUMERIC - -# added 2018/11/30 -0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks - -# END OF TABLE diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json deleted file mode 100644 index 1b38cd5d..00000000 --- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi_subplugin.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "systemDict" : "system_core.dic", - "oovProviderPlugin" : [ - { "class" : "com.worksap.nlp.elasticsearch.sudachi.plugin.FakeOovPlugin" } - ], - "pathRewritePlugin" : [ - { "class" : "com.worksap.nlp.sudachi.JoinOovPlugin", - "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ] } - ] -}