DNA processing
Problem 5-5.
Generate the k-mer Composition of a String
Given a string Text, its k-mer composition Compositionk(Text) is the collection of all k-mer substrings of Text (including repeated k-mers). For example,
Composition3(TATGGGGTGC) = {ATG, GGG, GGG, GGT, GTG, TAT, TGC, TGG}
Note that we have listed k-mers in lexicographic order (i.e., how they would appear in a dictionary) rather than in the order of their appearance in TATGGGGTGC. We have done this because the correct ordering of the reads is unknown when they are generated.
String Composition Problem
Generate the k-mer composition of a string.
Given: An integer k and a string Text.
Return: Compositionk(Text) (the k-mers can be provided in any order).
Sample Dataset
5
CAATCCAAC
Sample Output
AATCC
ATCCA
CAATC
CCAAC
TCCAA
Problem 5-6.
Reconstruct a String from its Genome
Find the string spelled by a genome path.
Given: A sequence of k-mers Pattern1, … , Patternn such that the last k – 1 symbols of Patterni are equal to the first k – 1 symbols of Patterni+1 for i from 1 to n-1.
Return: A string Text of length k+n-1 where the i-th k-mer in Text is equal to Patterni for all i.
Sample Dataset
ACCGA
CCGAA
CGAAG
GAAGC
AAGCT
Sample Output
ACCGAAGCT
Problem 5-7.
Reconstruct a String from its k-mer Composition
Given: An integer k followed by a list of k-mers Patterns.
Return: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)
Sample Dataset
4
CTTA
ACCA
TACC
GGCT
GCTT
TTAC
Sample Output
GGCTTACCA
Solution
Program5_5.java
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
public class Program5_5 {
/**
* @return the k-mers in any order.
*/
public static List<String> composition(final int k, final String text) {
final List<String> kmers = new ArrayList<>();
for (int i = 0; i + k <= text.length(); i++) {
final String kmer = text.substring(i, i + k);
kmers.add(kmer);
}
Collections.sort(kmers);
return kmers;
}
// test program
public static void main(String[] args) {
final List<String> kmers = composition(5, “CAATCCAAC”);
for (final String kmer : kmers) {
System.out.println(kmer);
}
}
}
Program5_6.java
import java.util.List;
import java.util.Arrays;
public class Program5_6 {
/**
* @return the string spelled by the genome path.
*/
public static String construct(final List<String> path) {
final StringBuilder result = new StringBuilder();
if (!path.isEmpty()) {
result.append(path.get(0));
for (int i = 1; i < path.size(); i++) {
final String node = path.get(i);
result.append(node.charAt(node.length() – 1));
}
}
return result.toString();
}
// test program
public static void main(String[] args) {
final List<String> path = Arrays.asList(“ACCGA”, “CCGAA”, “CGAAG”, “GAAGC”, “AAGCT”);
System.out.println(construct(path));
}
}
Program5_7.java
import java.util.List;
import java.util.Arrays;
import java.util.Set;
import java.util.HashSet;
public class Program5_7 {
/**
* Reconstruct a string from its k-mer composition.
*/
public static String construct(final int k, final List<String> kmers) {
if (!kmers.isEmpty()) {
for (int i = 0; i < kmers.size(); i++) {
// starting from each kmer as the first part
// check if we can traverse all kmers as a path.
final StringBuilder result = new StringBuilder(kmers.get(i));
// construct the index list for unvisited kmers
final Set<Integer> unvisited = new HashSet<>();
for (int j = 0; j < kmers.size(); j++) {
if (j != i) {
unvisited.add(j);
}
}
if (traverse(k, kmers, unvisited, result)) {
return result.toString();
}
}
}
return “”;
}
// a recursive function to check if we can reach all kmers.
private static boolean traverse(final int k, final List<String> kmers, final Set<Integer> unvisited, final StringBuilder builder) {
if (unvisited.isEmpty()) {
return true;
}
final String prefix = builder.substring(builder.length() – k + 1);
final Set<Integer> remaining = new HashSet<>();
for (final int next : unvisited) {
final String kmer = kmers.get(next);
if (kmer.startsWith(prefix)) {
// try to use this one as the next one on the path
builder.append(kmer.charAt(k – 1));
remaining.addAll(unvisited);
remaining.remove(next);
// continue trying with remaining kmers
if (traverse(k, kmers, remaining, builder)) {
return true;
}
// recover the temporary path for next trial
remaining.clear();
builder.delete(builder.length() – 1, builder.length());
}
}
return false;
}
// test program
public static void main(String[] args) {
final List<String> kmers = Arrays.asList(
“CTTA”, “ACCA”, “TACC”, “GGCT”, “GCTT”, “TTAC”);
System.out.println(construct(4, kmers));
}
}