From 205dec3a025f52ec34def94c1dbc3116d2dfa303 Mon Sep 17 00:00:00 2001 From: Deepak Bhagat Date: Sun, 26 Apr 2026 16:47:40 +0530 Subject: [PATCH 1/3] feat(components): add missing languages to Code Text Splitter The Code Text Splitter only exposed 16 languages natively supported by @langchain/textsplitters. The Python LangChain library supports many more. This adds 9 additional languages (c, csharp, cobol, elixir, haskell, kotlin, lua, powershell, ts) with custom separators ported from Python LangChain, while keeping existing languages on the native fromLanguage() path. --- .../CodeTextSplitter/CodeTextSplitter.ts | 118 +++++++++++++++--- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts index 292486ac2bf..4d369811684 100644 --- a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts +++ b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts @@ -3,9 +3,48 @@ import { getBaseClasses } from '../../../src/utils' import { RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitterParams, - SupportedTextSplitterLanguage + SupportedTextSplitterLanguage, + SupportedTextSplitterLanguages } from '@langchain/textsplitters' +const extraLanguageSeparators: Record = { + c: ['\nclass ', '\nvoid ', '\nint ', '\nfloat ', '\ndouble ', '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\n\n', '\n', ' ', ''], + csharp: [ + '\ninterface ', '\nenum ', '\nimplements ', '\ndelegate ', '\nevent ', '\nclass ', '\nabstract ', + '\npublic ', '\nprotected ', '\nprivate ', '\nstatic ', '\nreturn ', + '\nif ', '\ncontinue ', '\nfor ', '\nforeach ', '\nwhile ', '\nswitch ', '\nbreak ', '\ncase ', '\nelse ', + '\ntry ', '\nthrow ', '\nfinally ', '\ncatch ', '\n\n', '\n', ' ', '' + ], + cobol: [ + '\nIDENTIFICATION DIVISION.', '\nENVIRONMENT DIVISION.', '\nDATA DIVISION.', '\nPROCEDURE DIVISION.', + '\nWORKING-STORAGE SECTION.', '\nLINKAGE SECTION.', '\nFILE SECTION.', '\nINPUT-OUTPUT SECTION.', + '\nOPEN ', '\nCLOSE ', '\nREAD ', '\nWRITE ', '\nIF ', '\nELSE ', '\nMOVE ', '\nPERFORM ', + '\nUNTIL ', '\nVARYING ', '\nACCEPT ', '\nDISPLAY ', '\nSTOP RUN.', '\n', ' ', '' + ], + elixir: [ + '\ndef ', '\ndefp ', '\ndefmodule ', '\ndefprotocol ', '\ndefmacro ', '\ndefmacrop ', + '\nif ', '\nunless ', '\nwhile ', '\ncase ', '\ncond ', '\nwith ', '\nfor ', '\ndo ', '\n\n', '\n', ' ', '' + ], + haskell: [ + '\nmain :: ', '\nmain = ', '\nlet ', '\nin ', '\ndo ', '\nwhere ', '\n:: ', '\n= ', + '\ndata ', '\nnewtype ', '\ntype ', '\nmodule ', '\nimport ', '\nqualified ', '\nimport qualified ', + '\nclass ', '\ninstance ', '\ncase ', '\n| ', '\n= {', '\n, ', '\n\n', '\n', ' ', '' + ], + kotlin: [ + '\nclass ', '\npublic ', '\nprotected ', '\nprivate ', '\ninternal ', '\ncompanion ', '\nfun ', '\nval ', '\nvar ', + '\nif ', '\nfor ', '\nwhile ', '\nwhen ', '\ncase ', '\nelse ', '\n\n', '\n', ' ', '' + ], + lua: ['\nlocal ', '\nfunction ', '\nif ', '\nfor ', '\nwhile ', '\nrepeat ', '\n\n', '\n', ' ', ''], + powershell: [ + '\nfunction ', '\nparam ', '\nif ', '\nforeach ', '\nfor ', '\nwhile ', '\nswitch ', + '\nclass ', '\ntry ', '\ncatch ', '\nfinally ', '\n\n', '\n', ' ', '' + ], + ts: [ + '\nenum ', '\ninterface ', '\nnamespace ', '\ntype ', '\nclass ', '\nfunction ', '\nconst ', '\nlet ', '\nvar ', + '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\ndefault ', '\n\n', '\n', ' ', '' + ] +} + class CodeTextSplitter_TextSplitters implements INode { label: string name: string @@ -31,14 +70,38 @@ class CodeTextSplitter_TextSplitters implements INode { name: 'language', type: 'options', options: [ + { + label: 'c', + name: 'c' + }, + { + label: 'cobol', + name: 'cobol' + }, { label: 'cpp', name: 'cpp' }, + { + label: 'csharp', + name: 'csharp' + }, + { + label: 'elixir', + name: 'elixir' + }, { label: 'go', name: 'go' }, + { + label: 'haskell', + name: 'haskell' + }, + { + label: 'html', + name: 'html' + }, { label: 'java', name: 'java' @@ -47,10 +110,30 @@ class CodeTextSplitter_TextSplitters implements INode { label: 'js', name: 'js' }, + { + label: 'kotlin', + name: 'kotlin' + }, + { + label: 'latex', + name: 'latex' + }, + { + label: 'lua', + name: 'lua' + }, + { + label: 'markdown', + name: 'markdown' + }, { label: 'php', name: 'php' }, + { + label: 'powershell', + name: 'powershell' + }, { label: 'proto', name: 'proto' @@ -76,24 +159,16 @@ class CodeTextSplitter_TextSplitters implements INode { name: 'scala' }, { - label: 'swift', - name: 'swift' - }, - { - label: 'markdown', - name: 'markdown' - }, - { - label: 'latex', - name: 'latex' + label: 'sol', + name: 'sol' }, { - label: 'html', - name: 'html' + label: 'swift', + name: 'swift' }, { - label: 'sol', - name: 'sol' + label: 'ts', + name: 'ts' } ] }, @@ -118,16 +193,23 @@ class CodeTextSplitter_TextSplitters implements INode { async init(nodeData: INodeData): Promise { const chunkSize = nodeData.inputs?.chunkSize as string const chunkOverlap = nodeData.inputs?.chunkOverlap as string - const language = nodeData.inputs?.language as SupportedTextSplitterLanguage + const language = nodeData.inputs?.language as string const obj = {} as RecursiveCharacterTextSplitterParams if (chunkSize) obj.chunkSize = parseInt(chunkSize, 10) if (chunkOverlap) obj.chunkOverlap = parseInt(chunkOverlap, 10) - const splitter = RecursiveCharacterTextSplitter.fromLanguage(language, obj) + if ((SupportedTextSplitterLanguages as readonly string[]).includes(language)) { + return RecursiveCharacterTextSplitter.fromLanguage(language as SupportedTextSplitterLanguage, obj) + } + + const separators = extraLanguageSeparators[language] + if (separators) { + return new RecursiveCharacterTextSplitter({ ...obj, separators }) + } - return splitter + return RecursiveCharacterTextSplitter.fromLanguage(language as SupportedTextSplitterLanguage, obj) } } module.exports = { nodeClass: CodeTextSplitter_TextSplitters } From 6bf2122bafeda759a7b5bddf1dbe99567df704eb Mon Sep 17 00:00:00 2001 From: Deepak Bhagat Date: Sun, 26 Apr 2026 17:04:23 +0530 Subject: [PATCH 2/3] fix: address review feedback on language separators C: replace class with struct/union/enum (C has no class keyword). C#: remove implements (C# uses :), add namespace and struct. Elixir: remove while (not a keyword in Elixir). Kotlin: remove case (Kotlin uses when). Fallback: return default splitter instead of calling fromLanguage with an unsupported language. --- .../textsplitters/CodeTextSplitter/CodeTextSplitter.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts index 4d369811684..6a3decfb460 100644 --- a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts +++ b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts @@ -8,9 +8,9 @@ import { } from '@langchain/textsplitters' const extraLanguageSeparators: Record = { - c: ['\nclass ', '\nvoid ', '\nint ', '\nfloat ', '\ndouble ', '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\n\n', '\n', ' ', ''], + c: ['\nstruct ', '\nunion ', '\nenum ', '\nvoid ', '\nint ', '\nfloat ', '\ndouble ', '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\n\n', '\n', ' ', ''], csharp: [ - '\ninterface ', '\nenum ', '\nimplements ', '\ndelegate ', '\nevent ', '\nclass ', '\nabstract ', + '\nnamespace ', '\ninterface ', '\nenum ', '\nstruct ', '\ndelegate ', '\nevent ', '\nclass ', '\nabstract ', '\npublic ', '\nprotected ', '\nprivate ', '\nstatic ', '\nreturn ', '\nif ', '\ncontinue ', '\nfor ', '\nforeach ', '\nwhile ', '\nswitch ', '\nbreak ', '\ncase ', '\nelse ', '\ntry ', '\nthrow ', '\nfinally ', '\ncatch ', '\n\n', '\n', ' ', '' @@ -23,7 +23,7 @@ const extraLanguageSeparators: Record = { ], elixir: [ '\ndef ', '\ndefp ', '\ndefmodule ', '\ndefprotocol ', '\ndefmacro ', '\ndefmacrop ', - '\nif ', '\nunless ', '\nwhile ', '\ncase ', '\ncond ', '\nwith ', '\nfor ', '\ndo ', '\n\n', '\n', ' ', '' + '\nif ', '\nunless ', '\ncase ', '\ncond ', '\nwith ', '\nfor ', '\ndo ', '\n\n', '\n', ' ', '' ], haskell: [ '\nmain :: ', '\nmain = ', '\nlet ', '\nin ', '\ndo ', '\nwhere ', '\n:: ', '\n= ', @@ -32,7 +32,7 @@ const extraLanguageSeparators: Record = { ], kotlin: [ '\nclass ', '\npublic ', '\nprotected ', '\nprivate ', '\ninternal ', '\ncompanion ', '\nfun ', '\nval ', '\nvar ', - '\nif ', '\nfor ', '\nwhile ', '\nwhen ', '\ncase ', '\nelse ', '\n\n', '\n', ' ', '' + '\nif ', '\nfor ', '\nwhile ', '\nwhen ', '\nelse ', '\n\n', '\n', ' ', '' ], lua: ['\nlocal ', '\nfunction ', '\nif ', '\nfor ', '\nwhile ', '\nrepeat ', '\n\n', '\n', ' ', ''], powershell: [ @@ -209,7 +209,7 @@ class CodeTextSplitter_TextSplitters implements INode { return new RecursiveCharacterTextSplitter({ ...obj, separators }) } - return RecursiveCharacterTextSplitter.fromLanguage(language as SupportedTextSplitterLanguage, obj) + return new RecursiveCharacterTextSplitter(obj) } } module.exports = { nodeClass: CodeTextSplitter_TextSplitters } From e738e9fe28048ba232b5ee560d74af97eb35ea74 Mon Sep 17 00:00:00 2001 From: Deepak Bhagat Date: Mon, 27 Apr 2026 16:14:54 +0530 Subject: [PATCH 3/3] style: fix prettier formatting in CodeTextSplitter --- .../CodeTextSplitter/CodeTextSplitter.ts | 186 ++++++++++++++++-- 1 file changed, 166 insertions(+), 20 deletions(-) diff --git a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts index 6a3decfb460..05b6cf4b9a2 100644 --- a/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts +++ b/packages/components/nodes/textsplitters/CodeTextSplitter/CodeTextSplitter.ts @@ -8,40 +8,186 @@ import { } from '@langchain/textsplitters' const extraLanguageSeparators: Record = { - c: ['\nstruct ', '\nunion ', '\nenum ', '\nvoid ', '\nint ', '\nfloat ', '\ndouble ', '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\n\n', '\n', ' ', ''], + c: [ + '\nstruct ', + '\nunion ', + '\nenum ', + '\nvoid ', + '\nint ', + '\nfloat ', + '\ndouble ', + '\nif ', + '\nfor ', + '\nwhile ', + '\nswitch ', + '\ncase ', + '\n\n', + '\n', + ' ', + '' + ], csharp: [ - '\nnamespace ', '\ninterface ', '\nenum ', '\nstruct ', '\ndelegate ', '\nevent ', '\nclass ', '\nabstract ', - '\npublic ', '\nprotected ', '\nprivate ', '\nstatic ', '\nreturn ', - '\nif ', '\ncontinue ', '\nfor ', '\nforeach ', '\nwhile ', '\nswitch ', '\nbreak ', '\ncase ', '\nelse ', - '\ntry ', '\nthrow ', '\nfinally ', '\ncatch ', '\n\n', '\n', ' ', '' + '\nnamespace ', + '\ninterface ', + '\nenum ', + '\nstruct ', + '\ndelegate ', + '\nevent ', + '\nclass ', + '\nabstract ', + '\npublic ', + '\nprotected ', + '\nprivate ', + '\nstatic ', + '\nreturn ', + '\nif ', + '\ncontinue ', + '\nfor ', + '\nforeach ', + '\nwhile ', + '\nswitch ', + '\nbreak ', + '\ncase ', + '\nelse ', + '\ntry ', + '\nthrow ', + '\nfinally ', + '\ncatch ', + '\n\n', + '\n', + ' ', + '' ], cobol: [ - '\nIDENTIFICATION DIVISION.', '\nENVIRONMENT DIVISION.', '\nDATA DIVISION.', '\nPROCEDURE DIVISION.', - '\nWORKING-STORAGE SECTION.', '\nLINKAGE SECTION.', '\nFILE SECTION.', '\nINPUT-OUTPUT SECTION.', - '\nOPEN ', '\nCLOSE ', '\nREAD ', '\nWRITE ', '\nIF ', '\nELSE ', '\nMOVE ', '\nPERFORM ', - '\nUNTIL ', '\nVARYING ', '\nACCEPT ', '\nDISPLAY ', '\nSTOP RUN.', '\n', ' ', '' + '\nIDENTIFICATION DIVISION.', + '\nENVIRONMENT DIVISION.', + '\nDATA DIVISION.', + '\nPROCEDURE DIVISION.', + '\nWORKING-STORAGE SECTION.', + '\nLINKAGE SECTION.', + '\nFILE SECTION.', + '\nINPUT-OUTPUT SECTION.', + '\nOPEN ', + '\nCLOSE ', + '\nREAD ', + '\nWRITE ', + '\nIF ', + '\nELSE ', + '\nMOVE ', + '\nPERFORM ', + '\nUNTIL ', + '\nVARYING ', + '\nACCEPT ', + '\nDISPLAY ', + '\nSTOP RUN.', + '\n', + ' ', + '' ], elixir: [ - '\ndef ', '\ndefp ', '\ndefmodule ', '\ndefprotocol ', '\ndefmacro ', '\ndefmacrop ', - '\nif ', '\nunless ', '\ncase ', '\ncond ', '\nwith ', '\nfor ', '\ndo ', '\n\n', '\n', ' ', '' + '\ndef ', + '\ndefp ', + '\ndefmodule ', + '\ndefprotocol ', + '\ndefmacro ', + '\ndefmacrop ', + '\nif ', + '\nunless ', + '\ncase ', + '\ncond ', + '\nwith ', + '\nfor ', + '\ndo ', + '\n\n', + '\n', + ' ', + '' ], haskell: [ - '\nmain :: ', '\nmain = ', '\nlet ', '\nin ', '\ndo ', '\nwhere ', '\n:: ', '\n= ', - '\ndata ', '\nnewtype ', '\ntype ', '\nmodule ', '\nimport ', '\nqualified ', '\nimport qualified ', - '\nclass ', '\ninstance ', '\ncase ', '\n| ', '\n= {', '\n, ', '\n\n', '\n', ' ', '' + '\nmain :: ', + '\nmain = ', + '\nlet ', + '\nin ', + '\ndo ', + '\nwhere ', + '\n:: ', + '\n= ', + '\ndata ', + '\nnewtype ', + '\ntype ', + '\nmodule ', + '\nimport ', + '\nqualified ', + '\nimport qualified ', + '\nclass ', + '\ninstance ', + '\ncase ', + '\n| ', + '\n= {', + '\n, ', + '\n\n', + '\n', + ' ', + '' ], kotlin: [ - '\nclass ', '\npublic ', '\nprotected ', '\nprivate ', '\ninternal ', '\ncompanion ', '\nfun ', '\nval ', '\nvar ', - '\nif ', '\nfor ', '\nwhile ', '\nwhen ', '\nelse ', '\n\n', '\n', ' ', '' + '\nclass ', + '\npublic ', + '\nprotected ', + '\nprivate ', + '\ninternal ', + '\ncompanion ', + '\nfun ', + '\nval ', + '\nvar ', + '\nif ', + '\nfor ', + '\nwhile ', + '\nwhen ', + '\nelse ', + '\n\n', + '\n', + ' ', + '' ], lua: ['\nlocal ', '\nfunction ', '\nif ', '\nfor ', '\nwhile ', '\nrepeat ', '\n\n', '\n', ' ', ''], powershell: [ - '\nfunction ', '\nparam ', '\nif ', '\nforeach ', '\nfor ', '\nwhile ', '\nswitch ', - '\nclass ', '\ntry ', '\ncatch ', '\nfinally ', '\n\n', '\n', ' ', '' + '\nfunction ', + '\nparam ', + '\nif ', + '\nforeach ', + '\nfor ', + '\nwhile ', + '\nswitch ', + '\nclass ', + '\ntry ', + '\ncatch ', + '\nfinally ', + '\n\n', + '\n', + ' ', + '' ], ts: [ - '\nenum ', '\ninterface ', '\nnamespace ', '\ntype ', '\nclass ', '\nfunction ', '\nconst ', '\nlet ', '\nvar ', - '\nif ', '\nfor ', '\nwhile ', '\nswitch ', '\ncase ', '\ndefault ', '\n\n', '\n', ' ', '' + '\nenum ', + '\ninterface ', + '\nnamespace ', + '\ntype ', + '\nclass ', + '\nfunction ', + '\nconst ', + '\nlet ', + '\nvar ', + '\nif ', + '\nfor ', + '\nwhile ', + '\nswitch ', + '\ncase ', + '\ndefault ', + '\n\n', + '\n', + ' ', + '' ] }